Actual source code: aijcusparse.cu
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library.
4: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8: #include <petscconf.h>
9: #include <../src/mat/impls/aij/seq/aij.h>
10: #include <../src/mat/impls/sbaij/seq/sbaij.h>
11: #include <../src/vec/vec/impls/dvecimpl.h>
12: #include <petsc/private/vecimpl.h>
13: #undef VecType
14: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15: #include <thrust/async/for_each.h>
17: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
22: typedef enum {
23: CUSPARSE_MV_ALG_DEFAULT = 0,
24: CUSPARSE_COOMV_ALG = 1,
25: CUSPARSE_CSRMV_ALG1 = 2,
26: CUSPARSE_CSRMV_ALG2 = 3
27: } cusparseSpMVAlg_t;
29: typedef enum {
30: CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31: CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
32: CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
33: CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
34: CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
35: CUSPARSE_SPMM_ALG_DEFAULT = 0,
36: CUSPARSE_SPMM_COO_ALG1 = 1,
37: CUSPARSE_SPMM_COO_ALG2 = 2,
38: CUSPARSE_SPMM_COO_ALG3 = 3,
39: CUSPARSE_SPMM_COO_ALG4 = 5,
40: CUSPARSE_SPMM_CSR_ALG1 = 4,
41: CUSPARSE_SPMM_CSR_ALG2 = 6,
42: } cusparseSpMMAlg_t;
44: typedef enum {
45: CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
46: CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic
47: } cusparseCsr2CscAlg_t;
48: */
49: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52: #endif
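/* Illustrative sketch (not part of the library code): PetscOptionsEnum() returns the 0-based
   index of the string the user selected, so the arrays above must list the names in the same
   order as the cuSPARSE enum values they stand for. For example, choosing "CSRMV_ALG1" yields
   index 2, which must coincide with CUSPARSE_CSRMV_ALG1 == 2 in cusparse.h; that is exactly
   what the consistency checks in MatSetFromOptions_SeqAIJCUSPARSE() below verify, e.g.

     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(...);   // enum drifted, arrays need updating
*/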
54: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
58: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
69: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
70: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
77: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
83: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
87: PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
88: PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
90: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
93: {
94: cusparseStatus_t stat;
95: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
98: if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
99: cusparsestruct->stream = stream;
100: stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
101: return(0);
102: }
104: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
105: {
106: cusparseStatus_t stat;
107: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
110: if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
111: if (cusparsestruct->handle != handle) {
112: if (cusparsestruct->handle) {
113: stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
114: }
115: cusparsestruct->handle = handle;
116: }
117: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
118: return(0);
119: }
121: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
122: {
123: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
124: PetscBool flg;
125: PetscErrorCode ierr;
128: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
129: if (!flg || !cusparsestruct) return(0);
130: if (cusparsestruct->handle) cusparsestruct->handle = 0;
131: return(0);
132: }
134: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
135: {
137: *type = MATSOLVERCUSPARSE;
138: return(0);
139: }
141: /*MC
142: MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for matrices
143: of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp on a single GPU. Currently supported
144: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145: performance in the triangular solves. Triangular solves with full LU and Cholesky factors can also be
146: performed through the CUSPARSE triangular solve algorithm. However, the performance can be quite poor
147: and thus these algorithms are not recommended. This class does NOT support direct solver operations.
149: Level: beginner
151: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152: M*/
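/* Usage sketch (illustrative, not part of this file): with a matrix of type MATAIJCUSPARSE this
   solver is typically selected at runtime, e.g.

     ./app -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   or programmatically, assuming ksp is an already configured KSP:

     PC pc;
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCILU);
     PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);
*/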
154: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
155: {
157: PetscInt n = A->rmap->n;
160: MatCreate(PetscObjectComm((PetscObject)A),B);
161: MatSetSizes(*B,n,n,n,n);
162: (*B)->factortype = ftype;
163: MatSetType(*B,MATSEQAIJCUSPARSE);
165: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
166: MatSetBlockSizesFromMats(*B,A,A);
167: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
168: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
169: PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
170: PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);
171: PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);
172: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
173: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
174: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
175: PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);
176: PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);
177: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
179: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
180: (*B)->canuseordering = PETSC_TRUE;
181: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
182: return(0);
183: }
185: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
186: {
187: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
190: switch (op) {
191: case MAT_CUSPARSE_MULT:
192: cusparsestruct->format = format;
193: break;
194: case MAT_CUSPARSE_ALL:
195: cusparsestruct->format = format;
196: break;
197: default:
198: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
199: }
200: return(0);
201: }
203: /*@
204: MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
205: operation. Only the MatMult operation can use different GPU storage formats
206: for MPIAIJCUSPARSE matrices.
207: Not Collective
209: Input Parameters:
210: + A - Matrix of type SEQAIJCUSPARSE
211: . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
212: - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
216: Level: intermediate
218: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
219: @*/
220: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
221: {
226: PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
227: return(0);
228: }
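/* Usage sketch (illustrative): to store a SEQAIJCUSPARSE matrix in ELL format for all supported
   operations one could call

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL);

   or pass the equivalent option -mat_cusparse_storage_format ell, which is parsed in
   MatSetFromOptions_SeqAIJCUSPARSE() below.
*/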
230: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
231: {
235: switch (op) {
236: case MAT_FORM_EXPLICIT_TRANSPOSE:
237: /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
238: if (A->form_explicit_transpose && !flg) {MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);}
239: A->form_explicit_transpose = flg;
240: break;
241: default:
242: MatSetOption_SeqAIJ(A,op,flg);
243: break;
244: }
245: return(0);
246: }
248: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
250: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
251: {
252: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
253: IS isrow = b->row,iscol = b->col;
254: PetscBool row_identity,col_identity;
258: MatSeqAIJCUSPARSECopyFromGPU(A);
259: MatLUFactorNumeric_SeqAIJ(B,A,info);
260: B->offloadmask = PETSC_OFFLOAD_CPU;
261: /* determine which version of MatSolve needs to be used. */
262: ISIdentity(isrow,&row_identity);
263: ISIdentity(iscol,&col_identity);
264: if (row_identity && col_identity) {
265: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
266: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
267: B->ops->matsolve = NULL;
268: B->ops->matsolvetranspose = NULL;
269: } else {
270: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
271: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
272: B->ops->matsolve = NULL;
273: B->ops->matsolvetranspose = NULL;
274: }
276: /* get the triangular factors */
277: MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
278: return(0);
279: }
281: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
282: {
283: PetscErrorCode ierr;
284: MatCUSPARSEStorageFormat format;
285: PetscBool flg;
286: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
289: PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
290: if (A->factortype == MAT_FACTOR_NONE) {
291: PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
292: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
293: if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);}
295: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
296: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
297: if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);}
298: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
299: PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
300: "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
301: /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
302: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
303: if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
304: #else
305: if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
306: #endif
307: PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
308: "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
309: if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
311: PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
312: "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
313: if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
314: #endif
315: }
316: PetscOptionsTail();
317: return(0);
318: }
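/* Options sketch (illustrative): on a CUDA-11 build the choices above can be combined on the
   command line, e.g.

     -mat_cusparse_mult_storage_format csr -mat_cusparse_spmv_alg CSRMV_ALG1 \
     -mat_cusparse_spmm_alg CSR_ALG2 -mat_cusparse_csr2csc_alg ALG1

   The accepted names are the entries of MatCUSPARSEStorageFormats[], MatCUSPARSESpMVAlgorithms[],
   MatCUSPARSESpMMAlgorithms[], and MatCUSPARSECsr2CscAlgorithms[] declared near the top of this file.
*/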
320: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
321: {
322: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
323: PetscErrorCode ierr;
326: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
327: MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
328: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
329: return(0);
330: }
332: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
333: {
334: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
335: PetscErrorCode ierr;
338: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
339: MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
340: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
341: return(0);
342: }
344: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
345: {
346: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
347: PetscErrorCode ierr;
350: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
351: MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
352: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
353: return(0);
354: }
356: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
357: {
358: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
359: PetscErrorCode ierr;
362: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
363: MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
364: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
365: return(0);
366: }
368: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
369: {
370: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
371: PetscInt n = A->rmap->n;
372: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
373: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
374: cusparseStatus_t stat;
375: const PetscInt *ai = a->i,*aj = a->j,*vi;
376: const MatScalar *aa = a->a,*v;
377: PetscInt *AiLo, *AjLo;
378: PetscInt i,nz, nzLower, offset, rowOffset;
379: PetscErrorCode ierr;
380: cudaError_t cerr;
383: if (!n) return(0);
384: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
385: try {
386: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
387: nzLower=n+ai[n]-ai[1];
388: if (!loTriFactor) {
389: PetscScalar *AALo;
391: cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
393: /* Allocate Space for the lower triangular matrix */
394: cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
395: cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
397: /* Fill the lower triangular matrix */
398: AiLo[0] = (PetscInt) 0;
399: AiLo[n] = nzLower;
400: AjLo[0] = (PetscInt) 0;
401: AALo[0] = (MatScalar) 1.0;
402: v = aa;
403: vi = aj;
404: offset = 1;
405: rowOffset= 1;
406: for (i=1; i<n; i++) {
407: nz = ai[i+1] - ai[i];
408: /* additional 1 for the term on the diagonal */
409: AiLo[i] = rowOffset;
410: rowOffset += nz+1;
412: PetscArraycpy(&(AjLo[offset]), vi, nz);
413: PetscArraycpy(&(AALo[offset]), v, nz);
415: offset += nz;
416: AjLo[offset] = (PetscInt) i;
417: AALo[offset] = (MatScalar) 1.0;
418: offset += 1;
420: v += nz;
421: vi += nz;
422: }
424: /* allocate space for the triangular factor information */
425: PetscNew(&loTriFactor);
426: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
427: /* Create the matrix description */
428: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
429: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
430: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
431: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
432: #else
433: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
434: #endif
435: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
436: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
438: /* set the operation */
439: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
441: /* set the matrix */
442: loTriFactor->csrMat = new CsrMatrix;
443: loTriFactor->csrMat->num_rows = n;
444: loTriFactor->csrMat->num_cols = n;
445: loTriFactor->csrMat->num_entries = nzLower;
447: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
448: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
450: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
451: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
453: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
454: loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
456: /* Create the solve analysis information */
457: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
458: stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
459: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
460: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
461: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
462: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
463: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
464: &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
465: cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
466: #endif
468: /* perform the solve analysis */
469: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
470: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
471: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
472: loTriFactor->csrMat->column_indices->data().get(),
473: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
474: loTriFactor->solveInfo,
475: loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
476: #else
477: loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
478: #endif
479: cerr = WaitForCUDA();CHKERRCUDA(cerr);
480: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
482: /* assign the pointer */
483: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
484: loTriFactor->AA_h = AALo;
485: cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
486: cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
487: PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
488: } else { /* update values only */
489: if (!loTriFactor->AA_h) {
490: cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
491: }
492: /* Fill the lower triangular matrix */
493: loTriFactor->AA_h[0] = 1.0;
494: v = aa;
495: vi = aj;
496: offset = 1;
497: for (i=1; i<n; i++) {
498: nz = ai[i+1] - ai[i];
499: PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);
500: offset += nz;
501: loTriFactor->AA_h[offset] = 1.0;
502: offset += 1;
503: v += nz;
504: }
505: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
506: PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));
507: }
508: } catch(char *ex) {
509: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
510: }
511: }
512: return(0);
513: }
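/* Worked sketch (illustrative): for a 3x3 factor whose strict-lower CSR row pointers are
   ai = {0,0,1,3} (0, 1, and 2 strict-lower entries in rows 0, 1, 2), the routine above builds

     nzLower = n + ai[n] - ai[1] = 3 + 3 - 0 = 6

   entries: each row's strict-lower part is copied verbatim and followed by an explicit 1.0 on the
   diagonal, giving AiLo = {0,1,3,6}. The unit diagonal is stored explicitly even though the
   descriptor is marked CUSPARSE_DIAG_TYPE_UNIT.
*/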
515: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
516: {
517: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
518: PetscInt n = A->rmap->n;
519: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
520: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
521: cusparseStatus_t stat;
522: const PetscInt *aj = a->j,*adiag = a->diag,*vi;
523: const MatScalar *aa = a->a,*v;
524: PetscInt *AiUp, *AjUp;
525: PetscInt i,nz, nzUpper, offset;
526: PetscErrorCode ierr;
527: cudaError_t cerr;
530: if (!n) return(0);
531: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
532: try {
533: /* next, figure out the number of nonzeros in the upper triangular matrix. */
534: nzUpper = adiag[0]-adiag[n];
535: if (!upTriFactor) {
536: PetscScalar *AAUp;
538: cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
540: /* Allocate Space for the upper triangular matrix */
541: cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
542: cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
544: /* Fill the upper triangular matrix */
545: AiUp[0]=(PetscInt) 0;
546: AiUp[n]=nzUpper;
547: offset = nzUpper;
548: for (i=n-1; i>=0; i--) {
549: v = aa + adiag[i+1] + 1;
550: vi = aj + adiag[i+1] + 1;
552: /* number of elements NOT on the diagonal */
553: nz = adiag[i] - adiag[i+1]-1;
555: /* decrement the offset */
556: offset -= (nz+1);
558: /* first, set the diagonal elements */
559: AjUp[offset] = (PetscInt) i;
560: AAUp[offset] = (MatScalar)1./v[nz];
561: AiUp[i] = AiUp[i+1] - (nz+1);
563: PetscArraycpy(&(AjUp[offset+1]), vi, nz);
564: PetscArraycpy(&(AAUp[offset+1]), v, nz);
565: }
567: /* allocate space for the triangular factor information */
568: PetscNew(&upTriFactor);
569: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
571: /* Create the matrix description */
572: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
573: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
574: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
575: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
576: #else
577: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
578: #endif
579: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
580: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
582: /* set the operation */
583: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
585: /* set the matrix */
586: upTriFactor->csrMat = new CsrMatrix;
587: upTriFactor->csrMat->num_rows = n;
588: upTriFactor->csrMat->num_cols = n;
589: upTriFactor->csrMat->num_entries = nzUpper;
591: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
592: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
594: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
595: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
597: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
598: upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
600: /* Create the solve analysis information */
601: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
602: stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
603: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
604: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
605: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
606: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
607: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
608: &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
609: cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
610: #endif
612: /* perform the solve analysis */
613: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
614: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
615: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
616: upTriFactor->csrMat->column_indices->data().get(),
617: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
618: upTriFactor->solveInfo,
619: upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
620: #else
621: upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
622: #endif
623: cerr = WaitForCUDA();CHKERRCUDA(cerr);
624: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
626: /* assign the pointer */
627: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
628: upTriFactor->AA_h = AAUp;
629: cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
630: cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
631: PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
632: } else {
633: if (!upTriFactor->AA_h) {
634: cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
635: }
636: /* Fill the upper triangular matrix */
637: offset = nzUpper;
638: for (i=n-1; i>=0; i--) {
639: v = aa + adiag[i+1] + 1;
641: /* number of elements NOT on the diagonal */
642: nz = adiag[i] - adiag[i+1]-1;
644: /* decrement the offset */
645: offset -= (nz+1);
647: /* first, set the diagonal elements */
648: upTriFactor->AA_h[offset] = 1./v[nz];
649: PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);
650: }
651: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
652: PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));
653: }
654: } catch(char *ex) {
655: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
656: }
657: }
658: return(0);
659: }
661: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
662: {
663: PetscErrorCode ierr;
664: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
665: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
666: IS isrow = a->row,iscol = a->icol;
667: PetscBool row_identity,col_identity;
668: PetscInt n = A->rmap->n;
671: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
672: MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
673: MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);
675: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
676: cusparseTriFactors->nnz=a->nz;
678: A->offloadmask = PETSC_OFFLOAD_BOTH;
679: /* lower triangular indices */
680: ISIdentity(isrow,&row_identity);
681: if (!row_identity && !cusparseTriFactors->rpermIndices) {
682: const PetscInt *r;
684: ISGetIndices(isrow,&r);
685: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
686: cusparseTriFactors->rpermIndices->assign(r, r+n);
687: ISRestoreIndices(isrow,&r);
688: PetscLogCpuToGpu(n*sizeof(PetscInt));
689: }
691: /* upper triangular indices */
692: ISIdentity(iscol,&col_identity);
693: if (!col_identity && !cusparseTriFactors->cpermIndices) {
694: const PetscInt *c;
696: ISGetIndices(iscol,&c);
697: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
698: cusparseTriFactors->cpermIndices->assign(c, c+n);
699: ISRestoreIndices(iscol,&c);
700: PetscLogCpuToGpu(n*sizeof(PetscInt));
701: }
702: return(0);
703: }
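/* Sketch (illustrative): the rpermIndices/cpermIndices arrays built above hold the device-side row
   and column permutations of a non-natural ordering; a solve then typically gathers the right-hand
   side through them with Thrust before the triangular solves, e.g. (xGPU being a hypothetical
   thrust device pointer to the input vector and tempGPU the work vector)

     thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->rpermIndices->begin()),
                  thrust::make_permutation_iterator(xGPU, cusparseTriFactors->rpermIndices->end()),
                  tempGPU);
*/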
705: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
706: {
707: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
708: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
709: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
710: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
711: cusparseStatus_t stat;
712: PetscErrorCode ierr;
713: cudaError_t cerr;
714: PetscInt *AiUp, *AjUp;
715: PetscScalar *AAUp;
716: PetscScalar *AALo;
717: PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
718: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
719: const PetscInt *ai = b->i,*aj = b->j,*vj;
720: const MatScalar *aa = b->a,*v;
723: if (!n) return(0);
724: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
725: try {
726: cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
727: cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
728: if (!upTriFactor && !loTriFactor) {
729: /* Allocate Space for the upper triangular matrix */
730: cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
731: cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
733: /* Fill the upper triangular matrix */
734: AiUp[0]=(PetscInt) 0;
735: AiUp[n]=nzUpper;
736: offset = 0;
737: for (i=0; i<n; i++) {
738: /* set the pointers */
739: v = aa + ai[i];
740: vj = aj + ai[i];
741: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
743: /* first, set the diagonal elements */
744: AjUp[offset] = (PetscInt) i;
745: AAUp[offset] = (MatScalar)1.0/v[nz];
746: AiUp[i] = offset;
747: AALo[offset] = (MatScalar)1.0/v[nz];
749: offset+=1;
750: if (nz>0) {
751: PetscArraycpy(&(AjUp[offset]), vj, nz);
752: PetscArraycpy(&(AAUp[offset]), v, nz);
753: for (j=offset; j<offset+nz; j++) {
754: AAUp[j] = -AAUp[j];
755: AALo[j] = AAUp[j]/v[nz];
756: }
757: offset+=nz;
758: }
759: }
761: /* allocate space for the triangular factor information */
762: PetscNew(&upTriFactor);
763: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
765: /* Create the matrix description */
766: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
767: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
768: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
769: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
770: #else
771: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
772: #endif
773: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
774: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
776: /* set the matrix */
777: upTriFactor->csrMat = new CsrMatrix;
778: upTriFactor->csrMat->num_rows = A->rmap->n;
779: upTriFactor->csrMat->num_cols = A->cmap->n;
780: upTriFactor->csrMat->num_entries = a->nz;
782: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
783: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
785: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
786: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
788: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
789: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
791: /* set the operation */
792: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
794: /* Create the solve analysis information */
795: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
796: stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
797: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
798: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
799: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
800: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
801: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
802: &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
803: cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
804: #endif
806: /* perform the solve analysis */
807: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
808: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
809: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
810: upTriFactor->csrMat->column_indices->data().get(),
811: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
812: upTriFactor->solveInfo,
813: upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
814: #else
815: upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
816: #endif
817: cerr = WaitForCUDA();CHKERRCUDA(cerr);
818: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
820: /* assign the pointer */
821: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
823: /* allocate space for the triangular factor information */
824: PetscNew(&loTriFactor);
825: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
827: /* Create the matrix description */
828: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
829: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
830: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
831: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
832: #else
833: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
834: #endif
835: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
836: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
838: /* set the operation */
839: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
841: /* set the matrix */
842: loTriFactor->csrMat = new CsrMatrix;
843: loTriFactor->csrMat->num_rows = A->rmap->n;
844: loTriFactor->csrMat->num_cols = A->cmap->n;
845: loTriFactor->csrMat->num_entries = a->nz;
847: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
848: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
850: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
851: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
853: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
854: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
856: /* Create the solve analysis information */
857: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
858: stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
859: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
860: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
861: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
862: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
863: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
864: &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
865: cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
866: #endif
868: /* perform the solve analysis */
869: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
870: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
871: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
872: loTriFactor->csrMat->column_indices->data().get(),
873: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
874: loTriFactor->solveInfo,
875: loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
876: #else
877: loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
878: #endif
879: cerr = WaitForCUDA();CHKERRCUDA(cerr);
880: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
882: /* assign the pointer */
883: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
885: PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));
886: cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
887: cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
888: } else {
889: /* Fill the upper triangular matrix */
890: offset = 0;
891: for (i=0; i<n; i++) {
892: /* set the pointers */
893: v = aa + ai[i];
894: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
896: /* first, set the diagonal elements */
897: AAUp[offset] = 1.0/v[nz];
898: AALo[offset] = 1.0/v[nz];
900: offset+=1;
901: if (nz>0) {
902: PetscArraycpy(&(AAUp[offset]), v, nz);
903: for (j=offset; j<offset+nz; j++) {
904: AAUp[j] = -AAUp[j];
905: AALo[j] = AAUp[j]/v[nz];
906: }
907: offset+=nz;
908: }
909: }
910: if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
911: if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
912: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
913: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
914: PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));
915: }
916: cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
917: cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
918: } catch(char *ex) {
919: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
920: }
921: }
922: return(0);
923: }
925: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
926: {
927: PetscErrorCode ierr;
928: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
929: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
930: IS ip = a->row;
931: PetscBool perm_identity;
932: PetscInt n = A->rmap->n;
935: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
936: MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
937: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
938: cusparseTriFactors->nnz=(a->nz-n)*2 + n;
940: A->offloadmask = PETSC_OFFLOAD_BOTH;
942: /* lower triangular indices */
943: ISIdentity(ip,&perm_identity);
944: if (!perm_identity) {
945: IS iip;
946: const PetscInt *irip,*rip;
948: ISInvertPermutation(ip,PETSC_DECIDE,&iip);
949: ISGetIndices(iip,&irip);
950: ISGetIndices(ip,&rip);
951: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
952: cusparseTriFactors->rpermIndices->assign(rip, rip+n);
953: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
954: cusparseTriFactors->cpermIndices->assign(irip, irip+n);
955: ISRestoreIndices(iip,&irip);
956: ISDestroy(&iip);
957: ISRestoreIndices(ip,&rip);
958: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
959: }
960: return(0);
961: }
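/* Worked sketch (illustrative): for ICC the factor stores only the upper triangle including the
   diagonal, so a->nz counts each off-diagonal entry once. The GPU keeps both the upper factor and
   its transposed counterpart, hence

     cusparseTriFactors->nnz = (a->nz - n)*2 + n

   i.e. the strictly upper entries counted twice plus the diagonal once; for example n = 3 and
   a->nz = 5 gives nnz = 7.
*/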
963: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
964: {
965: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
966: IS ip = b->row;
967: PetscBool perm_identity;
971: MatSeqAIJCUSPARSECopyFromGPU(A);
972: MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
973: B->offloadmask = PETSC_OFFLOAD_CPU;
974: /* determine which version of MatSolve needs to be used. */
975: ISIdentity(ip,&perm_identity);
976: if (perm_identity) {
977: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
978: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
979: B->ops->matsolve = NULL;
980: B->ops->matsolvetranspose = NULL;
981: } else {
982: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
983: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
984: B->ops->matsolve = NULL;
985: B->ops->matsolvetranspose = NULL;
986: }
988: /* get the triangular factors */
989: MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
990: return(0);
991: }
993: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
994: {
995: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
996: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
997: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
998: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
999: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1000: cusparseStatus_t stat;
1001: cusparseIndexBase_t indexBase;
1002: cusparseMatrixType_t matrixType;
1003: cusparseFillMode_t fillMode;
1004: cusparseDiagType_t diagType;
1005: cudaError_t cerr;
1006: PetscErrorCode ierr;
1009: /* allocate space for the transpose of the lower triangular factor */
1010: PetscNew(&loTriFactorT);
1011: loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1013: /* set the matrix descriptors of the lower triangular factor */
1014: matrixType = cusparseGetMatType(loTriFactor->descr);
1015: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1016: fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1017: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1018: diagType = cusparseGetMatDiagType(loTriFactor->descr);
1020: /* Create the matrix description */
1021: stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1022: stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1023: stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1024: stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1025: stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1027: /* set the operation */
1028: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1030: /* allocate GPU space for the CSC of the lower triangular factor*/
1031: loTriFactorT->csrMat = new CsrMatrix;
1032: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
1033: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
1034: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
1035: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1036: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1037: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1039: /* compute the transpose of the lower triangular factor, i.e. the CSC */
1040: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1041: stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1042: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1043: loTriFactor->csrMat->values->data().get(),
1044: loTriFactor->csrMat->row_offsets->data().get(),
1045: loTriFactor->csrMat->column_indices->data().get(),
1046: loTriFactorT->csrMat->values->data().get(),
1047: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1048: CUSPARSE_ACTION_NUMERIC,indexBase,
1049: CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1050: cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1051: #endif
1053: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1054: stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1055: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1056: loTriFactor->csrMat->values->data().get(),
1057: loTriFactor->csrMat->row_offsets->data().get(),
1058: loTriFactor->csrMat->column_indices->data().get(),
1059: loTriFactorT->csrMat->values->data().get(),
1060: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1061: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1062: CUSPARSE_ACTION_NUMERIC, indexBase,
1063: CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1064: #else
1065: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1066: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1067: #endif
1068: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1069: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1071: /* Create the solve analysis information */
1072: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1073: stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1074: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1075: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1076: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1077: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1078: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1079: &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1080: cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1081: #endif
1083: /* perform the solve analysis */
1084: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1085: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1086: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1087: loTriFactorT->csrMat->column_indices->data().get(),
1088: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1089: loTriFactorT->solveInfo,
1090: loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1091: #else
1092: loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1093: #endif
1094: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1095: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1097: /* assign the pointer */
1098: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1100: /*********************************************/
1101: /* Now the Transpose of the Upper Tri Factor */
1102: /*********************************************/
1104: /* allocate space for the transpose of the upper triangular factor */
1105: PetscNew(&upTriFactorT);
1106: upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1108: /* set the matrix descriptors of the upper triangular factor */
1109: matrixType = cusparseGetMatType(upTriFactor->descr);
1110: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1111: fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1112: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1113: diagType = cusparseGetMatDiagType(upTriFactor->descr);
1115: /* Create the matrix description */
1116: stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1117: stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1118: stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1119: stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1120: stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1122: /* set the operation */
1123: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1125: /* allocate GPU space for the CSC of the upper triangular factor*/
1126: upTriFactorT->csrMat = new CsrMatrix;
1127: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
1128: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
1129: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
1130: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1131: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1132: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1134: /* compute the transpose of the upper triangular factor, i.e. the CSC */
1135: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1136: stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1137: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1138: upTriFactor->csrMat->values->data().get(),
1139: upTriFactor->csrMat->row_offsets->data().get(),
1140: upTriFactor->csrMat->column_indices->data().get(),
1141: upTriFactorT->csrMat->values->data().get(),
1142: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1143: CUSPARSE_ACTION_NUMERIC,indexBase,
1144: CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1145: cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1146: #endif
1148: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1149: stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1150: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1151: upTriFactor->csrMat->values->data().get(),
1152: upTriFactor->csrMat->row_offsets->data().get(),
1153: upTriFactor->csrMat->column_indices->data().get(),
1154: upTriFactorT->csrMat->values->data().get(),
1155: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1156: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1157: CUSPARSE_ACTION_NUMERIC, indexBase,
1158: CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1159: #else
1160: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1161: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1162: #endif
1164: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1165: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1167: /* Create the solve analysis information */
1168: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1169: stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1170: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1171: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1172: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1173: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1174: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1175: &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1176: cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1177: #endif
1179: /* perform the solve analysis */
1180: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1181: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1182: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1183: upTriFactorT->csrMat->column_indices->data().get(),
1184: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1185: upTriFactorT->solveInfo,
1186: upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1187: #else
1188: upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1189: #endif
1191: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1192: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1194: /* assign the pointer */
1195: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1196: return(0);
1197: }
1199: struct PetscScalarToPetscInt
1200: {
1201: __host__ __device__
1202: PetscInt operator()(PetscScalar s)
1203: {
1204: return (PetscInt)PetscRealPart(s);
1205: }
1206: };
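/* Usage sketch (illustrative): a functor like PetscScalarToPetscInt is typically passed to
   thrust::transform to turn device values stored as PetscScalar into integer indices, e.g.

     // d_vals is a hypothetical thrust::device_vector<PetscScalar> already filled on the device
     thrust::device_vector<PetscInt> d_idx(d_vals.size());
     thrust::transform(d_vals.begin(),d_vals.end(),d_idx.begin(),PetscScalarToPetscInt());
*/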
1208: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1209: {
1210: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1211: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1212: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1213: cusparseStatus_t stat;
1214: cusparseIndexBase_t indexBase;
1215: cudaError_t err;
1216: PetscErrorCode ierr;
1219: MatSeqAIJCUSPARSECopyToGPU(A);
1220: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1221: if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1222: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1223: if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
1224: if (A->transupdated) return(0);
1225: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1226: PetscLogGpuTimeBegin();
1227: if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1228: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1229: }
1230: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1231: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1232: stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1233: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1234: stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1235: stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1237: /* set alpha and beta */
1238: err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1239: err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1240: err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1241: err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1242: err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1243: err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1245: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1246: CsrMatrix *matrixT = new CsrMatrix;
1247: matstructT->mat = matrixT;
1248: matrixT->num_rows = A->cmap->n;
1249: matrixT->num_cols = A->rmap->n;
1250: matrixT->num_entries = a->nz;
1251: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1252: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1253: matrixT->values = new THRUSTARRAY(a->nz);
1255: if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1256: cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1258: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259: #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1260: stat = cusparseCreateCsr(&matstructT->matDescr,
1261: matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1262: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1263: matrixT->values->data().get(),
1264: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1265: indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1266: #else
1267: /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1268: see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1270: It is unclear what a proper value of matstructT->matDescr would be for an empty matrix, so I just set
1271: it to NULL so that any code relying on it fails immediately. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1272: when nnz = 0, matrixT->row_offsets[] should be filled with indexBase, so I also set it accordingly.
1273: */
1274: if (matrixT->num_entries) {
1275: stat = cusparseCreateCsr(&matstructT->matDescr,
1276: matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1277: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1278: matrixT->values->data().get(),
1279: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
1280: indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1282: } else {
1283: matstructT->matDescr = NULL;
1284: matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1285: }
1286: #endif
1287: #endif
1288: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1289: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1290: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1291: #else
1292: CsrMatrix *temp = new CsrMatrix;
1293: CsrMatrix *tempT = new CsrMatrix;
1294: /* First convert HYB to CSR */
1295: temp->num_rows = A->rmap->n;
1296: temp->num_cols = A->cmap->n;
1297: temp->num_entries = a->nz;
1298: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1299: temp->column_indices = new THRUSTINTARRAY32(a->nz);
1300: temp->values = new THRUSTARRAY(a->nz);
1302: stat = cusparse_hyb2csr(cusparsestruct->handle,
1303: matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1304: temp->values->data().get(),
1305: temp->row_offsets->data().get(),
1306: temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1308: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1309: tempT->num_rows = A->rmap->n;
1310: tempT->num_cols = A->cmap->n;
1311: tempT->num_entries = a->nz;
1312: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1313: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1314: tempT->values = new THRUSTARRAY(a->nz);
1316: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1317: temp->num_cols, temp->num_entries,
1318: temp->values->data().get(),
1319: temp->row_offsets->data().get(),
1320: temp->column_indices->data().get(),
1321: tempT->values->data().get(),
1322: tempT->column_indices->data().get(),
1323: tempT->row_offsets->data().get(),
1324: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1326: /* Last, convert CSC to HYB */
1327: cusparseHybMat_t hybMat;
1328: stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1329: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1330: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1331: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1332: matstructT->descr, tempT->values->data().get(),
1333: tempT->row_offsets->data().get(),
1334: tempT->column_indices->data().get(),
1335: hybMat, 0, partition);CHKERRCUSPARSE(stat);
1337: /* assign the pointer */
1338: matstructT->mat = hybMat;
1339: A->transupdated = PETSC_TRUE;
1340: /* delete temporaries */
1341: if (tempT) {
1342: if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1343: if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1344: if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1345: delete (CsrMatrix*) tempT;
1346: }
1347: if (temp) {
1348: if (temp->values) delete (THRUSTARRAY*) temp->values;
1349: if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1350: if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1351: delete (CsrMatrix*) temp;
1352: }
1353: #endif
1354: }
1355: }
1356: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1357: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1358: CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1359: if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1360: if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1361: if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1362: if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1363: if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1364: if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1365: if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1366: if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1367: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1368: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1369: cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1370: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
1371: }
1372: if (!cusparsestruct->csr2csc_i) {
1373: THRUSTARRAY csr2csc_a(matrix->num_entries);
1374: PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1376: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1377: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1378: void *csr2cscBuffer;
1379: size_t csr2cscBufferSize;
1380: stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1381: A->cmap->n, matrix->num_entries,
1382: matrix->values->data().get(),
1383: cusparsestruct->rowoffsets_gpu->data().get(),
1384: matrix->column_indices->data().get(),
1385: matrixT->values->data().get(),
1386: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1387: CUSPARSE_ACTION_NUMERIC,indexBase,
1388: cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1389: err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1390: #endif
1392: if (matrix->num_entries) {
1393: /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1394: mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1395: I checked every parameter and they were all fine; I have no clue why cusparse complains.
1397: Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1398: should be filled with indexBase. So I just take a shortcut here.
1399: */
1400: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1401: A->cmap->n,matrix->num_entries,
1402: csr2csc_a.data().get(),
1403: cusparsestruct->rowoffsets_gpu->data().get(),
1404: matrix->column_indices->data().get(),
1405: matrixT->values->data().get(),
1406: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1407: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1408: CUSPARSE_ACTION_NUMERIC,indexBase,
1409: cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1410: #else
1411: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1412: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1413: #endif
1414: } else {
1415: matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1416: }
1418: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1419: PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1420: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1421: err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1422: #endif
1423: }
1424: PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1425: thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1426: matrixT->values->begin()));
1427: }
1428: PetscLogGpuTimeEnd();
1429: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1430: /* the compressed row indices are not used for matTranspose */
1431: matstructT->cprowIndices = NULL;
1432: /* assign the pointer */
1433: ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1434: A->transupdated = PETSC_TRUE;
1435: return(0);
1436: }
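/* Note: the explicit transpose built above is cached in cusparsestruct->matTranspose and guarded by
   A->transupdated. MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA() uses it for MATPRODUCT_AtB when
   A->form_explicit_transpose is set (typically with MatSetOption(A,MAT_FORM_EXPLICIT_TRANSPOSE,PETSC_TRUE)),
   and MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE() uses it for AtB/ABt products, since the cuSPARSE
   SpGEMM routines do not support transposed operands. */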
1438: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1439: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1440: {
1441: PetscInt n = xx->map->n;
1442: const PetscScalar *barray;
1443: PetscScalar *xarray;
1444: thrust::device_ptr<const PetscScalar> bGPU;
1445: thrust::device_ptr<PetscScalar> xGPU;
1446: cusparseStatus_t stat;
1447: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1448: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1449: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1450: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1451: PetscErrorCode ierr;
1454: /* Analyze the matrix and create the transpose ... on the fly */
1455: if (!loTriFactorT && !upTriFactorT) {
1456: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1457: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1458: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1459: }
1461: /* Get the GPU pointers */
1462: VecCUDAGetArrayWrite(xx,&xarray);
1463: VecCUDAGetArrayRead(bb,&barray);
1464: xGPU = thrust::device_pointer_cast(xarray);
1465: bGPU = thrust::device_pointer_cast(barray);
1467: PetscLogGpuTimeBegin();
1468: /* First, reorder with the row permutation */
1469: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1470: thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1471: xGPU);
1473: /* First, solve U */
1474: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1475: upTriFactorT->csrMat->num_rows,
1476: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1477: upTriFactorT->csrMat->num_entries,
1478: #endif
1479: &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1480: upTriFactorT->csrMat->values->data().get(),
1481: upTriFactorT->csrMat->row_offsets->data().get(),
1482: upTriFactorT->csrMat->column_indices->data().get(),
1483: upTriFactorT->solveInfo,
1484: xarray,
1485: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1486: tempGPU->data().get(),
1487: upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1488: #else
1489: tempGPU->data().get());CHKERRCUSPARSE(stat);
1490: #endif
1492: /* Then, solve L */
1493: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1494: loTriFactorT->csrMat->num_rows,
1495: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1496: loTriFactorT->csrMat->num_entries,
1497: #endif
1498: &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1499: loTriFactorT->csrMat->values->data().get(),
1500: loTriFactorT->csrMat->row_offsets->data().get(),
1501: loTriFactorT->csrMat->column_indices->data().get(),
1502: loTriFactorT->solveInfo,
1503: tempGPU->data().get(),
1504: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1505: xarray,
1506: loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1507: #else
1508: xarray);CHKERRCUSPARSE(stat);
1509: #endif
1511: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1512: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1513: thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1514: tempGPU->begin());
1516: /* Copy the temporary to the full solution. */
1517: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1519: /* restore */
1520: VecCUDARestoreArrayRead(bb,&barray);
1521: VecCUDARestoreArrayWrite(xx,&xarray);
1522: PetscLogGpuTimeEnd();
1523: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1524: return(0);
1525: }
1527: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1528: {
1529: const PetscScalar *barray;
1530: PetscScalar *xarray;
1531: cusparseStatus_t stat;
1532: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1533: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1534: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1535: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1536: PetscErrorCode ierr;
1539: /* Analyze the matrix and create the transpose ... on the fly */
1540: if (!loTriFactorT && !upTriFactorT) {
1541: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1542: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1543: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1544: }
1546: /* Get the GPU pointers */
1547: VecCUDAGetArrayWrite(xx,&xarray);
1548: VecCUDAGetArrayRead(bb,&barray);
1550: PetscLogGpuTimeBegin();
1551: /* First, solve U */
1552: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1553: upTriFactorT->csrMat->num_rows,
1554: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1555: upTriFactorT->csrMat->num_entries,
1556: #endif
1557: &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1558: upTriFactorT->csrMat->values->data().get(),
1559: upTriFactorT->csrMat->row_offsets->data().get(),
1560: upTriFactorT->csrMat->column_indices->data().get(),
1561: upTriFactorT->solveInfo,
1562: barray,
1563: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1564: tempGPU->data().get(),
1565: upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1566: #else
1567: tempGPU->data().get());CHKERRCUSPARSE(stat);
1568: #endif
1570: /* Then, solve L */
1571: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1572: loTriFactorT->csrMat->num_rows,
1573: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1574: loTriFactorT->csrMat->num_entries,
1575: #endif
1576: &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1577: loTriFactorT->csrMat->values->data().get(),
1578: loTriFactorT->csrMat->row_offsets->data().get(),
1579: loTriFactorT->csrMat->column_indices->data().get(),
1580: loTriFactorT->solveInfo,
1581: tempGPU->data().get(),
1582: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1583: xarray,
1584: loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1585: #else
1586: xarray);CHKERRCUSPARSE(stat);
1587: #endif
1589: /* restore */
1590: VecCUDARestoreArrayRead(bb,&barray);
1591: VecCUDARestoreArrayWrite(xx,&xarray);
1592: PetscLogGpuTimeEnd();
1593: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1594: return(0);
1595: }
1597: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1598: {
1599: const PetscScalar *barray;
1600: PetscScalar *xarray;
1601: thrust::device_ptr<const PetscScalar> bGPU;
1602: thrust::device_ptr<PetscScalar> xGPU;
1603: cusparseStatus_t stat;
1604: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1605: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1606: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1607: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1608: PetscErrorCode ierr;
1612: /* Get the GPU pointers */
1613: VecCUDAGetArrayWrite(xx,&xarray);
1614: VecCUDAGetArrayRead(bb,&barray);
1615: xGPU = thrust::device_pointer_cast(xarray);
1616: bGPU = thrust::device_pointer_cast(barray);
1618: PetscLogGpuTimeBegin();
1619: /* First, reorder with the row permutation */
1620: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1621: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1622: tempGPU->begin());
1624: /* Next, solve L */
1625: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1626: loTriFactor->csrMat->num_rows,
1627: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1628: loTriFactor->csrMat->num_entries,
1629: #endif
1630: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1631: loTriFactor->csrMat->values->data().get(),
1632: loTriFactor->csrMat->row_offsets->data().get(),
1633: loTriFactor->csrMat->column_indices->data().get(),
1634: loTriFactor->solveInfo,
1635: tempGPU->data().get(),
1636: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1637: xarray,
1638: loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1639: #else
1640: xarray);CHKERRCUSPARSE(stat);
1641: #endif
1643: /* Then, solve U */
1644: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1645: upTriFactor->csrMat->num_rows,
1646: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1647: upTriFactor->csrMat->num_entries,
1648: #endif
1649: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1650: upTriFactor->csrMat->values->data().get(),
1651: upTriFactor->csrMat->row_offsets->data().get(),
1652: upTriFactor->csrMat->column_indices->data().get(),
1653: upTriFactor->solveInfo,xarray,
1654: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1655: tempGPU->data().get(),
1656: upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1657: #else
1658: tempGPU->data().get());CHKERRCUSPARSE(stat);
1659: #endif
1661: /* Last, reorder with the column permutation */
1662: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1663: thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1664: xGPU);
1666: VecCUDARestoreArrayRead(bb,&barray);
1667: VecCUDARestoreArrayWrite(xx,&xarray);
1668: PetscLogGpuTimeEnd();
1669: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1670: return(0);
1671: }
1673: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1674: {
1675: const PetscScalar *barray;
1676: PetscScalar *xarray;
1677: cusparseStatus_t stat;
1678: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1679: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1680: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1681: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1682: PetscErrorCode ierr;
1685: /* Get the GPU pointers */
1686: VecCUDAGetArrayWrite(xx,&xarray);
1687: VecCUDAGetArrayRead(bb,&barray);
1689: PetscLogGpuTimeBegin();
1690: /* First, solve L */
1691: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1692: loTriFactor->csrMat->num_rows,
1693: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1694: loTriFactor->csrMat->num_entries,
1695: #endif
1696: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1697: loTriFactor->csrMat->values->data().get(),
1698: loTriFactor->csrMat->row_offsets->data().get(),
1699: loTriFactor->csrMat->column_indices->data().get(),
1700: loTriFactor->solveInfo,
1701: barray,
1702: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1703: tempGPU->data().get(),
1704: loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1705: #else
1706: tempGPU->data().get());CHKERRCUSPARSE(stat);
1707: #endif
1709: /* Next, solve U */
1710: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1711: upTriFactor->csrMat->num_rows,
1712: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1713: upTriFactor->csrMat->num_entries,
1714: #endif
1715: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1716: upTriFactor->csrMat->values->data().get(),
1717: upTriFactor->csrMat->row_offsets->data().get(),
1718: upTriFactor->csrMat->column_indices->data().get(),
1719: upTriFactor->solveInfo,
1720: tempGPU->data().get(),
1721: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1722: xarray,
1723: upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1724: #else
1725: xarray);CHKERRCUSPARSE(stat);
1726: #endif
1728: VecCUDARestoreArrayRead(bb,&barray);
1729: VecCUDARestoreArrayWrite(xx,&xarray);
1730: PetscLogGpuTimeEnd();
1731: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1732: return(0);
1733: }
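/* A minimal usage sketch (assuming a CUDA-enabled PETSc build; declarations and error checking omitted):
   the MatSolve/MatSolveTranspose kernels above are typically reached through an ILU/LU preconditioner
   applied to a MATSEQAIJCUSPARSE matrix,

     MatSetType(A,MATSEQAIJCUSPARSE);
     KSPCreate(PETSC_COMM_SELF,&ksp);
     KSPSetOperators(ksp,A,A);
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCILU);
     KSPSolve(ksp,b,x);

   or, from the command line, -mat_type seqaijcusparse -vec_type cuda -pc_type ilu. The _NaturalOrdering
   variants are selected when the factorization uses the natural (identity) ordering and therefore skip the
   row/column permutations of the general paths. */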
1735: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1736: {
1737: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1738: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1739: cudaError_t cerr;
1740: PetscErrorCode ierr;
1743: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1744: CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
1746: PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1747: cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1748: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1749: PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));
1750: PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1751: A->offloadmask = PETSC_OFFLOAD_BOTH;
1752: }
1753: return(0);
1754: }
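/* Note on the offload mask: PETSC_OFFLOAD_GPU means the device copy of the values is newer than the host
   copy, so MatSeqAIJCUSPARSECopyFromGPU() above copies the values back to a->a and marks the matrix
   PETSC_OFFLOAD_BOTH (both copies valid). MatSeqAIJGetArray_SeqAIJCUSPARSE() below hands out the host array
   and marks the matrix PETSC_OFFLOAD_CPU, since the caller may modify the values. */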
1756: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1757: {
1758: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1762: MatSeqAIJCUSPARSECopyFromGPU(A);
1763: *array = a->a;
1764: A->offloadmask = PETSC_OFFLOAD_CPU;
1765: return(0);
1766: }
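/* MatSeqAIJCUSPARSECopyToGPU() below has two paths: if the nonzero pattern is unchanged (same nonzerostate)
   and the format is CSR, only the values are re-assigned on the device and the cached transpose is marked
   stale; otherwise the entire Mat_SeqAIJCUSPARSEMultStruct is rebuilt (descriptors, row offsets, column
   indices, values, and the compressed-row index list when a->compressedrow.use is set). */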
1768: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1769: {
1770: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1771: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1772: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1773: PetscInt m = A->rmap->n,*ii,*ridx,tmp;
1774: PetscErrorCode ierr;
1775: cusparseStatus_t stat;
1776: PetscBool both = PETSC_TRUE;
1777: cudaError_t err;
1780: if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
1781: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1782: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1783: CsrMatrix *matrix;
1784: matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1786: if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
1787: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1788: matrix->values->assign(a->a, a->a+a->nz);
1789: err = WaitForCUDA();CHKERRCUDA(err);
1790: PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1791: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1792: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
1793: } else {
1794: PetscInt nnz;
1795: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1796: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1797: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1798: delete cusparsestruct->workVector;
1799: delete cusparsestruct->rowoffsets_gpu;
1800: cusparsestruct->workVector = NULL;
1801: cusparsestruct->rowoffsets_gpu = NULL;
1802: try {
1803: if (a->compressedrow.use) {
1804: m = a->compressedrow.nrows;
1805: ii = a->compressedrow.i;
1806: ridx = a->compressedrow.rindex;
1807: } else {
1808: m = A->rmap->n;
1809: ii = a->i;
1810: ridx = NULL;
1811: }
1812: if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
1813: if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
1814: if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1815: else nnz = a->nz;
1817: /* create cusparse matrix */
1818: cusparsestruct->nrows = m;
1819: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1820: stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1821: stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1822: stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1824: err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1825: err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1826: err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1827: err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1828: err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1829: err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1830: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1832: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1833: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1834: /* set the matrix */
1835: CsrMatrix *mat= new CsrMatrix;
1836: mat->num_rows = m;
1837: mat->num_cols = A->cmap->n;
1838: mat->num_entries = nnz;
1839: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1840: mat->row_offsets->assign(ii, ii + m+1);
1842: mat->column_indices = new THRUSTINTARRAY32(nnz);
1843: mat->column_indices->assign(a->j, a->j+nnz);
1845: mat->values = new THRUSTARRAY(nnz);
1846: if (a->a) mat->values->assign(a->a, a->a+nnz);
1848: /* assign the pointer */
1849: matstruct->mat = mat;
1850: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1851: if (mat->num_rows) { /* cusparse errors on empty matrices! */
1852: stat = cusparseCreateCsr(&matstruct->matDescr,
1853: mat->num_rows, mat->num_cols, mat->num_entries,
1854: mat->row_offsets->data().get(), mat->column_indices->data().get(),
1855: mat->values->data().get(),
1856: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1857: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1858: }
1859: #endif
1860: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1861: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1862: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1863: #else
1864: CsrMatrix *mat= new CsrMatrix;
1865: mat->num_rows = m;
1866: mat->num_cols = A->cmap->n;
1867: mat->num_entries = nnz;
1868: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1869: mat->row_offsets->assign(ii, ii + m+1);
1871: mat->column_indices = new THRUSTINTARRAY32(nnz);
1872: mat->column_indices->assign(a->j, a->j+nnz);
1874: mat->values = new THRUSTARRAY(nnz);
1875: if (a->a) mat->values->assign(a->a, a->a+nnz);
1877: cusparseHybMat_t hybMat;
1878: stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1879: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1880: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1881: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1882: matstruct->descr, mat->values->data().get(),
1883: mat->row_offsets->data().get(),
1884: mat->column_indices->data().get(),
1885: hybMat, 0, partition);CHKERRCUSPARSE(stat);
1886: /* assign the pointer */
1887: matstruct->mat = hybMat;
1889: if (mat) {
1890: if (mat->values) delete (THRUSTARRAY*)mat->values;
1891: if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1892: if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1893: delete (CsrMatrix*)mat;
1894: }
1895: #endif
1896: }
1898: /* assign the compressed row indices */
1899: if (a->compressedrow.use) {
1900: cusparsestruct->workVector = new THRUSTARRAY(m);
1901: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1902: matstruct->cprowIndices->assign(ridx,ridx+m);
1903: tmp = m;
1904: } else {
1905: cusparsestruct->workVector = NULL;
1906: matstruct->cprowIndices = NULL;
1907: tmp = 0;
1908: }
1909: PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));
1911: /* assign the pointer */
1912: cusparsestruct->mat = matstruct;
1913: } catch(char *ex) {
1914: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1915: }
1916: err = WaitForCUDA();CHKERRCUDA(err);
1917: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1918: cusparsestruct->nonzerostate = A->nonzerostate;
1919: }
1920: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1921: }
1922: return(0);
1923: }
1925: struct VecCUDAPlusEquals
1926: {
1927: template <typename Tuple>
1928: __host__ __device__
1929: void operator()(Tuple t)
1930: {
1931: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1932: }
1933: };
1935: struct VecCUDAEquals
1936: {
1937: template <typename Tuple>
1938: __host__ __device__
1939: void operator()(Tuple t)
1940: {
1941: thrust::get<1>(t) = thrust::get<0>(t);
1942: }
1943: };
1945: struct VecCUDAEqualsReverse
1946: {
1947: template <typename Tuple>
1948: __host__ __device__
1949: void operator()(Tuple t)
1950: {
1951: thrust::get<0>(t) = thrust::get<1>(t);
1952: }
1953: };
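/* The three functors above are meant for thrust algorithms over zip iterators of (source,destination)
   tuples: VecCUDAEquals assigns, VecCUDAPlusEquals accumulates, and VecCUDAEqualsReverse assigns in the
   opposite direction. An illustrative sketch (not part of the library) that computes y[i] += x[i] on the
   device for two thrust::device_vector<PetscScalar> x,y would be

     thrust::for_each(thrust::device,
                      thrust::make_zip_iterator(thrust::make_tuple(x.begin(), y.begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(x.end(),   y.end())),
                      VecCUDAPlusEquals()); */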
1955: struct MatMatCusparse {
1956: PetscBool cisdense;
1957: PetscScalar *Bt;
1958: Mat X;
1959: PetscBool reusesym; /* cuSPARSE does not have separate symbolic and numeric phases for sparse mat-mat operations */
1960: PetscLogDouble flops;
1961: CsrMatrix *Bcsr;
1963: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1964: cusparseSpMatDescr_t matSpBDescr;
1965: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
1966: cusparseDnMatDescr_t matBDescr;
1967: cusparseDnMatDescr_t matCDescr;
1968: PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
1969: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
1970: void *dBuffer4;
1971: void *dBuffer5;
1972: #endif
1973: size_t mmBufferSize;
1974: void *mmBuffer;
1975: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1976: cusparseSpGEMMDescr_t spgemmDesc;
1977: #endif
1978: };
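/* MatMatCusparse carries the per-product workspace kept between the symbolic and numeric phases: Bt holds
   an explicitly transposed dense B for the pre-CUDA-11 csrmm path, X is the intermediate dense result for
   MATPRODUCT_PtAP/RARt, cisdense records whether C must be converted back to MATSEQDENSE, Bcsr is a
   non-compressed view of B's CSR data when B uses compressed rows, and the CUDA-11 members cache the
   cusparse descriptors and SpMM/SpGEMM buffers so that they are allocated only once. */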
1980: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1981: {
1982: PetscErrorCode ierr;
1983: MatMatCusparse *mmdata = (MatMatCusparse *)data;
1984: cudaError_t cerr;
1985: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1986: cusparseStatus_t stat;
1987: #endif
1990: cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1991: delete mmdata->Bcsr;
1992: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1993: if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
1994: if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
1995: if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
1996: if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
1997: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
1998: if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
1999: if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
2000: #endif
2001: if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
2002: if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
2003: #endif
2004: MatDestroy(&mmdata->X);
2005: PetscFree(data);
2006: return(0);
2007: }
2009: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2011: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2012: {
2013: Mat_Product *product = C->product;
2014: Mat A,B;
2015: PetscInt m,n,blda,clda;
2016: PetscBool flg,biscuda;
2017: Mat_SeqAIJCUSPARSE *cusp;
2018: cusparseStatus_t stat;
2019: cusparseOperation_t opA;
2020: const PetscScalar *barray;
2021: PetscScalar *carray;
2022: PetscErrorCode ierr;
2023: MatMatCusparse *mmdata;
2024: Mat_SeqAIJCUSPARSEMultStruct *mat;
2025: CsrMatrix *csrmat;
2028: MatCheckProduct(C,1);
2029: if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2030: mmdata = (MatMatCusparse*)product->data;
2031: A = product->A;
2032: B = product->B;
2033: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2034: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2035: /* currently CopyToGpu does not copy if the matrix is bound to the CPU.
2036: Instead of silently accepting a wrong answer, I prefer to raise an error */
2037: if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2038: MatSeqAIJCUSPARSECopyToGPU(A);
2039: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2040: switch (product->type) {
2041: case MATPRODUCT_AB:
2042: case MATPRODUCT_PtAP:
2043: mat = cusp->mat;
2044: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2045: m = A->rmap->n;
2046: n = B->cmap->n;
2047: break;
2048: case MATPRODUCT_AtB:
2049: if (!A->form_explicit_transpose) {
2050: mat = cusp->mat;
2051: opA = CUSPARSE_OPERATION_TRANSPOSE;
2052: } else {
2053: MatSeqAIJCUSPARSEFormExplicitTranspose(A);
2054: mat = cusp->matTranspose;
2055: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2056: }
2057: m = A->cmap->n;
2058: n = B->cmap->n;
2059: break;
2060: case MATPRODUCT_ABt:
2061: case MATPRODUCT_RARt:
2062: mat = cusp->mat;
2063: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2064: m = A->rmap->n;
2065: n = B->rmap->n;
2066: break;
2067: default:
2068: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2069: }
2070: if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2071: csrmat = (CsrMatrix*)mat->mat;
2072: /* if the user passed a CPU matrix, copy the data to the GPU */
2073: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
2074: if (!biscuda) {MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);}
2075: MatDenseCUDAGetArrayRead(B,&barray);
2077: MatDenseGetLDA(B,&blda);
2078: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2079: MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
2080: MatDenseGetLDA(mmdata->X,&clda);
2081: } else {
2082: MatDenseCUDAGetArrayWrite(C,&carray);
2083: MatDenseGetLDA(C,&clda);
2084: }
2086: PetscLogGpuTimeBegin();
2087: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2088: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2089: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2090: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2091: size_t mmBufferSize;
2092: if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2093: if (!mmdata->matBDescr) {
2094: stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2095: mmdata->Blda = blda;
2096: }
2098: if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2099: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2100: stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2101: mmdata->Clda = clda;
2102: }
2104: if (!mat->matDescr) {
2105: stat = cusparseCreateCsr(&mat->matDescr,
2106: csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2107: csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2108: csrmat->values->data().get(),
2109: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2110: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2111: }
2112: stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2113: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2114: mmdata->matCDescr,cusparse_scalartype,
2115: cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2116: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2117: cudaError_t cerr;
2118: cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2119: cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2120: mmdata->mmBufferSize = mmBufferSize;
2121: }
2122: mmdata->initialized = PETSC_TRUE;
2123: } else {
2124: /* to be safe, always update pointers of the mats */
2125: stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2126: stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2127: stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2128: }
2130: /* do cusparseSpMM, which supports transpose on B */
2131: stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2132: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2133: mmdata->matCDescr,cusparse_scalartype,
2134: cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2135: #else
2136: PetscInt k;
2137: /* cusparseXcsrmm does not support transpose on B */
2138: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2139: cublasHandle_t cublasv2handle;
2140: cublasStatus_t cerr;
2142: PetscCUBLASGetHandle(&cublasv2handle);
2143: cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2144: B->cmap->n,B->rmap->n,
2145: &PETSC_CUSPARSE_ONE ,barray,blda,
2146: &PETSC_CUSPARSE_ZERO,barray,blda,
2147: mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2148: blda = B->cmap->n;
2149: k = B->cmap->n;
2150: } else {
2151: k = B->rmap->n;
2152: }
2154: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2155: stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2156: csrmat->num_entries,mat->alpha_one,mat->descr,
2157: csrmat->values->data().get(),
2158: csrmat->row_offsets->data().get(),
2159: csrmat->column_indices->data().get(),
2160: mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2161: carray,clda);CHKERRCUSPARSE(stat);
2162: #endif
2163: PetscLogGpuTimeEnd();
2164: PetscLogGpuFlops(n*2.0*csrmat->num_entries);
2165: MatDenseCUDARestoreArrayRead(B,&barray);
2166: if (product->type == MATPRODUCT_RARt) {
2167: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2168: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
2169: } else if (product->type == MATPRODUCT_PtAP) {
2170: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2171: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
2172: } else {
2173: MatDenseCUDARestoreArrayWrite(C,&carray);
2174: }
2175: if (mmdata->cisdense) {
2176: MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
2177: }
2178: if (!biscuda) {
2179: MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
2180: }
2181: return(0);
2182: }
2184: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2185: {
2186: Mat_Product *product = C->product;
2187: Mat A,B;
2188: PetscInt m,n;
2189: PetscBool cisdense,flg;
2190: PetscErrorCode ierr;
2191: MatMatCusparse *mmdata;
2192: Mat_SeqAIJCUSPARSE *cusp;
2195: MatCheckProduct(C,1);
2196: if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2197: A = product->A;
2198: B = product->B;
2199: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2200: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2201: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2202: if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2203: switch (product->type) {
2204: case MATPRODUCT_AB:
2205: m = A->rmap->n;
2206: n = B->cmap->n;
2207: break;
2208: case MATPRODUCT_AtB:
2209: m = A->cmap->n;
2210: n = B->cmap->n;
2211: break;
2212: case MATPRODUCT_ABt:
2213: m = A->rmap->n;
2214: n = B->rmap->n;
2215: break;
2216: case MATPRODUCT_PtAP:
2217: m = B->cmap->n;
2218: n = B->cmap->n;
2219: break;
2220: case MATPRODUCT_RARt:
2221: m = B->rmap->n;
2222: n = B->rmap->n;
2223: break;
2224: default:
2225: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2226: }
2227: MatSetSizes(C,m,n,m,n);
2228: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2229: PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
2230: MatSetType(C,MATSEQDENSECUDA);
2232: /* product data */
2233: PetscNew(&mmdata);
2234: mmdata->cisdense = cisdense;
2235: #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2236: /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2237: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2238: cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2239: }
2240: #endif
2241: /* for these products we need intermediate storage */
2242: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2243: MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
2244: MatSetType(mmdata->X,MATSEQDENSECUDA);
2245: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2246: MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
2247: } else {
2248: MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
2249: }
2250: }
2251: C->product->data = mmdata;
2252: C->product->destroy = MatDestroy_MatMatCusparse;
2254: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2255: return(0);
2256: }
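/* A minimal usage sketch (declarations and error checking omitted) that exercises the symbolic/numeric pair
   above, with A of type MATSEQAIJCUSPARSE and B of type MATSEQDENSECUDA (a MATSEQDENSE B is converted on
   the fly):

     MatProductCreate(A,B,NULL,&C);
     MatProductSetType(C,MATPRODUCT_AB);
     MatProductSetFromOptions(C);
     MatProductSymbolic(C);
     MatProductNumeric(C);

   MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C) is built on the same MatProduct machinery. */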
2258: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2259: {
2260: Mat_Product *product = C->product;
2261: Mat A,B;
2262: Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp;
2263: Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
2264: Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2265: CsrMatrix *Acsr,*Bcsr,*Ccsr;
2266: PetscBool flg;
2267: PetscErrorCode ierr;
2268: cusparseStatus_t stat;
2269: cudaError_t cerr;
2270: MatProductType ptype;
2271: MatMatCusparse *mmdata;
2272: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2273: cusparseSpMatDescr_t BmatSpDescr;
2274: #endif
2275: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2278: MatCheckProduct(C,1);
2279: if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2280: PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);
2281: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2282: mmdata = (MatMatCusparse*)C->product->data;
2283: A = product->A;
2284: B = product->B;
2285: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2286: mmdata->reusesym = PETSC_FALSE;
2287: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2288: if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2289: Cmat = Ccusp->mat;
2290: if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2291: Ccsr = (CsrMatrix*)Cmat->mat;
2292: if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2293: goto finalize;
2294: }
2295: if (!c->nz) goto finalize;
2296: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2297: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2298: PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2299: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2300: if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2301: if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2302: Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2303: Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2304: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2305: if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2306: if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2307: if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2308: MatSeqAIJCUSPARSECopyToGPU(A);
2309: MatSeqAIJCUSPARSECopyToGPU(B);
2311: ptype = product->type;
2312: if (A->symmetric && ptype == MATPRODUCT_AtB) {
2313: ptype = MATPRODUCT_AB;
2314: if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2315: }
2316: if (B->symmetric && ptype == MATPRODUCT_ABt) {
2317: ptype = MATPRODUCT_AB;
2318: if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2319: }
2320: switch (ptype) {
2321: case MATPRODUCT_AB:
2322: Amat = Acusp->mat;
2323: Bmat = Bcusp->mat;
2324: break;
2325: case MATPRODUCT_AtB:
2326: Amat = Acusp->matTranspose;
2327: Bmat = Bcusp->mat;
2328: break;
2329: case MATPRODUCT_ABt:
2330: Amat = Acusp->mat;
2331: Bmat = Bcusp->matTranspose;
2332: break;
2333: default:
2334: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2335: }
2336: Cmat = Ccusp->mat;
2337: if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2338: if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2339: if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2340: Acsr = (CsrMatrix*)Amat->mat;
2341: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2342: Ccsr = (CsrMatrix*)Cmat->mat;
2343: if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2344: if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2345: if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2346: PetscLogGpuTimeBegin();
2347: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2348: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2349: stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2350: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2351: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2352: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2353: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2354: mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2355: #else
2356: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2357: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2358: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2359: mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2360: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2361: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2362: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2363: #endif
2364: #else
2365: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2366: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2367: Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2368: Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2369: Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2370: #endif
2371: PetscLogGpuFlops(mmdata->flops);
2372: cerr = WaitForCUDA();CHKERRCUDA(cerr);
2373: PetscLogGpuTimeEnd();
2374: C->offloadmask = PETSC_OFFLOAD_GPU;
2375: finalize:
2376: /* shorter version of MatAssemblyEnd_SeqAIJ */
2377: PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);
2378: PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");
2379: PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);
2380: c->reallocs = 0;
2381: C->info.mallocs += 0;
2382: C->info.nz_unneeded = 0;
2383: C->assembled = C->was_assembled = PETSC_TRUE;
2384: C->num_ass++;
2385: return(0);
2386: }
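/* Note: the numeric phase above reuses everything set up by MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE()
   below. With CUDA >= 11.4 the sparsity pattern of C is kept and only the values are recomputed with
   cusparseSpGEMMreuse_compute(); with earlier CUDA-11 releases cusparseSpGEMM_compute() and
   cusparseSpGEMM_copy() are called again with the buffers saved in MatMatCusparse; before CUDA-11 the legacy
   csrgemm routine recomputes C directly into the preallocated CSR arrays. */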
2388: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2389: {
2390: Mat_Product *product = C->product;
2391: Mat A,B;
2392: Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp;
2393: Mat_SeqAIJ *a,*b,*c;
2394: Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2395: CsrMatrix *Acsr,*Bcsr,*Ccsr;
2396: PetscInt i,j,m,n,k;
2397: PetscBool flg;
2398: PetscErrorCode ierr;
2399: cusparseStatus_t stat;
2400: cudaError_t cerr;
2401: MatProductType ptype;
2402: MatMatCusparse *mmdata;
2403: PetscLogDouble flops;
2404: PetscBool biscompressed,ciscompressed;
2405: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2406: int64_t C_num_rows1, C_num_cols1, C_nnz1;
2407: cusparseSpMatDescr_t BmatSpDescr;
2408: #else
2409: int cnz;
2410: #endif
2411: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2414: MatCheckProduct(C,1);
2415: if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2416: A = product->A;
2417: B = product->B;
2418: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2419: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2420: PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2421: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2422: a = (Mat_SeqAIJ*)A->data;
2423: b = (Mat_SeqAIJ*)B->data;
2424: /* product data */
2425: PetscNew(&mmdata);
2426: C->product->data = mmdata;
2427: C->product->destroy = MatDestroy_MatMatCusparse;
2429: MatSeqAIJCUSPARSECopyToGPU(A);
2430: MatSeqAIJCUSPARSECopyToGPU(B);
2431: Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2432: Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2433: if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2434: if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2436: ptype = product->type;
2437: if (A->symmetric && ptype == MATPRODUCT_AtB) {
2438: ptype = MATPRODUCT_AB;
2439: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2440: }
2441: if (B->symmetric && ptype == MATPRODUCT_ABt) {
2442: ptype = MATPRODUCT_AB;
2443: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2444: }
2445: biscompressed = PETSC_FALSE;
2446: ciscompressed = PETSC_FALSE;
2447: switch (ptype) {
2448: case MATPRODUCT_AB:
2449: m = A->rmap->n;
2450: n = B->cmap->n;
2451: k = A->cmap->n;
2452: Amat = Acusp->mat;
2453: Bmat = Bcusp->mat;
2454: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2455: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2456: break;
2457: case MATPRODUCT_AtB:
2458: m = A->cmap->n;
2459: n = B->cmap->n;
2460: k = A->rmap->n;
2461: MatSeqAIJCUSPARSEFormExplicitTranspose(A);
2462: Amat = Acusp->matTranspose;
2463: Bmat = Bcusp->mat;
2464: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2465: break;
2466: case MATPRODUCT_ABt:
2467: m = A->rmap->n;
2468: n = B->rmap->n;
2469: k = A->cmap->n;
2470: MatSeqAIJCUSPARSEFormExplicitTranspose(B);
2471: Amat = Acusp->mat;
2472: Bmat = Bcusp->matTranspose;
2473: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2474: break;
2475: default:
2476: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2477: }
2479: /* create cusparse matrix */
2480: MatSetSizes(C,m,n,m,n);
2481: MatSetType(C,MATSEQAIJCUSPARSE);
2482: c = (Mat_SeqAIJ*)C->data;
2483: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2484: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
2485: Ccsr = new CsrMatrix;
2487: c->compressedrow.use = ciscompressed;
2488: if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format too */
2489: c->compressedrow.nrows = a->compressedrow.nrows;
2490: PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);
2491: PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);
2492: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2493: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2494: Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2495: } else {
2496: c->compressedrow.nrows = 0;
2497: c->compressedrow.i = NULL;
2498: c->compressedrow.rindex = NULL;
2499: Ccusp->workVector = NULL;
2500: Cmat->cprowIndices = NULL;
2501: }
2502: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2503: Ccusp->mat = Cmat;
2504: Ccusp->mat->mat = Ccsr;
2505: Ccsr->num_rows = Ccusp->nrows;
2506: Ccsr->num_cols = n;
2507: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2508: stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2509: stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2510: stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2511: cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2512: cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2513: cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2514: cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2515: cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2516: cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2517: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
2518: thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2519: c->nz = 0;
2520: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2521: Ccsr->values = new THRUSTARRAY(c->nz);
2522: goto finalizesym;
2523: }
2525: if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2526: if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2527: Acsr = (CsrMatrix*)Amat->mat;
2528: if (!biscompressed) {
2529: Bcsr = (CsrMatrix*)Bmat->mat;
2530: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2531: BmatSpDescr = Bmat->matDescr;
2532: #endif
2533: } else { /* we need to use row offsets for the full matrix */
2534: CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2535: Bcsr = new CsrMatrix;
2536: Bcsr->num_rows = B->rmap->n;
2537: Bcsr->num_cols = cBcsr->num_cols;
2538: Bcsr->num_entries = cBcsr->num_entries;
2539: Bcsr->column_indices = cBcsr->column_indices;
2540: Bcsr->values = cBcsr->values;
2541: if (!Bcusp->rowoffsets_gpu) {
2542: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2543: Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2544: PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
2545: }
2546: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2547: mmdata->Bcsr = Bcsr;
2548: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2549: if (Bcsr->num_rows && Bcsr->num_cols) {
2550: stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2551: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2552: Bcsr->values->data().get(),
2553: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2554: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2555: }
2556: BmatSpDescr = mmdata->matSpBDescr;
2557: #endif
2558: }
2559: if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2560: if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2561: /* precompute flops count */
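/* Each nonzero a(i,j) of A pairs with every nonzero of row j of B, costing one multiply and one add,
   hence the 2*(b->i[brow+1]-b->i[brow]) term below; for C = A^T*B, row i of A combines with row i of B
   in an outer-product fashion, giving 2*anzi*bnzi flops per row i. */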
2562: if (ptype == MATPRODUCT_AB) {
2563: for (i=0, flops = 0; i<A->rmap->n; i++) {
2564: const PetscInt st = a->i[i];
2565: const PetscInt en = a->i[i+1];
2566: for (j=st; j<en; j++) {
2567: const PetscInt brow = a->j[j];
2568: flops += 2.*(b->i[brow+1] - b->i[brow]);
2569: }
2570: }
2571: } else if (ptype == MATPRODUCT_AtB) {
2572: for (i=0, flops = 0; i<A->rmap->n; i++) {
2573: const PetscInt anzi = a->i[i+1] - a->i[i];
2574: const PetscInt bnzi = b->i[i+1] - b->i[i];
2575: flops += (2.*anzi)*bnzi;
2576: }
2577: } else { /* TODO */
2578: flops = 0.;
2579: }
2581: mmdata->flops = flops;
2582: PetscLogGpuTimeBegin();
2584: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2585: stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2586: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2587: NULL, NULL, NULL,
2588: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2589: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2590: stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2591: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2592: {
2593: /* cusparseSpGEMMreuse has a more reasonable API than cusparseSpGEMM, so we prefer to use it.
2594: We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2595: */
2596: void* dBuffer1 = NULL;
2597: void* dBuffer2 = NULL;
2598: void* dBuffer3 = NULL;
2599: /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2600: size_t bufferSize1 = 0;
2601: size_t bufferSize2 = 0;
2602: size_t bufferSize3 = 0;
2603: size_t bufferSize4 = 0;
2604: size_t bufferSize5 = 0;
2606: /*----------------------------------------------------------------------*/
2607: /* query how many bytes of external memory are needed (returned in bufferSize1) */
2608: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2609: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2610: &bufferSize1, NULL);CHKERRCUSPARSE(stat);
2611: cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
2612: /* inspect the matrices A and B to understand the memory requirement for the next step */
2613: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2614: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2615: &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);
2617: /*----------------------------------------------------------------------*/
2618: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2619: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2620: &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
2621: cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
2622: cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
2623: cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
2624: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2625: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2626: &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
2627: cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
2628: cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);
2630: /*----------------------------------------------------------------------*/
2631: /* get matrix C non-zero entries C_nnz1 */
2632: stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2633: c->nz = (PetscInt) C_nnz1;
2634: /* allocate matrix C */
2635: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2636: Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2637: /* update matC with the new pointers */
2638: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2639: Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2641: /*----------------------------------------------------------------------*/
2642: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2643: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2644: &bufferSize5, NULL);CHKERRCUSPARSE(stat);
2645: cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
2646: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2647: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2648: &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
2649: cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
2650: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2651: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2652: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2653: mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2654: PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);
2655: }
2656: #else
2657: size_t bufSize2;
2658: /* query how many bytes of external memory are needed (returned in bufSize2) */
2659: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2660: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2661: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2662: mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2663: cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2664: /* inspect the matrices A and B to understand the memory requirement for the next step */
2665: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2666: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2667: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2668: mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2669: /* query again how many bytes of external memory are needed (returned in mmdata->mmBufferSize) */
2670: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2671: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2672: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2673: mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2674: /* Neither the CUSPARSE documentation nor the API is clear here:
2675: we need both buffers to perform the operations properly!
2676: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2677: it only appears in the workEstimation calls, yet it seems to be needed in compute, so the address
2678: is probably stored in the descriptor! What a messy API... */
2679: cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2680: /* compute the intermediate product of A * B */
2681: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2682: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2683: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2684: mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2685: /* get matrix C non-zero entries C_nnz1 */
2686: stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2687: c->nz = (PetscInt) C_nnz1;
2688: PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);
2689: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2690: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2691: Ccsr->values = new THRUSTARRAY(c->nz);
2692: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2693: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2694: Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2695: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2696: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2697: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2698: #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2699: #else
2700: stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2701: stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2702: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2703: Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2704: Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2705: Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2706: c->nz = cnz;
2707: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2708: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2709: Ccsr->values = new THRUSTARRAY(c->nz);
2710: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2712: stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2713: /* With the old gemm interface (removed as of CUDA 11.0) we cannot compute the symbolic factorization only.
2714: I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when
2715: D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2716: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2717: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2718: Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2719: Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2720: Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2721: #endif
2722: PetscLogGpuFlops(mmdata->flops);
2723: PetscLogGpuTimeEnd();
2724: finalizesym:
2725: c->singlemalloc = PETSC_FALSE;
2726: c->free_a = PETSC_TRUE;
2727: c->free_ij = PETSC_TRUE;
2728: PetscMalloc1(m+1,&c->i);
2729: PetscMalloc1(c->nz,&c->j);
2730: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2731: PetscInt *d_i = c->i;
2732: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2733: THRUSTINTARRAY jj(Ccsr->column_indices->size());
2734: ii = *Ccsr->row_offsets;
2735: jj = *Ccsr->column_indices;
2736: if (ciscompressed) d_i = c->compressedrow.i;
2737: cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2738: cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2739: } else {
2740: PetscInt *d_i = c->i;
2741: if (ciscompressed) d_i = c->compressedrow.i;
2742: cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2743: cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2744: }
2745: if (ciscompressed) { /* need to expand host row offsets */
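/* For example, with m = 5 rows, nonzero rows rindex = [1,3] and compressed offsets i = [0,2,5],
   the loop below produces the expanded row offsets c->i = [0,0,2,2,5,5]. */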
2746: PetscInt r = 0;
2747: c->i[0] = 0;
2748: for (k = 0; k < c->compressedrow.nrows; k++) {
2749: const PetscInt next = c->compressedrow.rindex[k];
2750: const PetscInt old = c->compressedrow.i[k];
2751: for (; r < next; r++) c->i[r+1] = old;
2752: }
2753: for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2754: }
2755: PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
2756: PetscMalloc1(m,&c->ilen);
2757: PetscMalloc1(m,&c->imax);
2758: c->maxnz = c->nz;
2759: c->nonzerorowcnt = 0;
2760: c->rmax = 0;
2761: for (k = 0; k < m; k++) {
2762: const PetscInt nn = c->i[k+1] - c->i[k];
2763: c->ilen[k] = c->imax[k] = nn;
2764: c->nonzerorowcnt += (PetscInt)!!nn;
2765: c->rmax = PetscMax(c->rmax,nn);
2766: }
2767: MatMarkDiagonal_SeqAIJ(C);
2768: PetscMalloc1(c->nz,&c->a);
2769: Ccsr->num_entries = c->nz;
2771: C->nonzerostate++;
2772: PetscLayoutSetUp(C->rmap);
2773: PetscLayoutSetUp(C->cmap);
2774: Ccusp->nonzerostate = C->nonzerostate;
2775: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2776: C->preallocated = PETSC_TRUE;
2777: C->assembled = PETSC_FALSE;
2778: C->was_assembled = PETSC_FALSE;
2779: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2780: mmdata->reusesym = PETSC_TRUE;
2781: C->offloadmask = PETSC_OFFLOAD_GPU;
2782: }
2783: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2784: return(0);
2785: }
2787: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2789: /* handles sparse or dense B */
2790: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2791: {
2792: Mat_Product *product = mat->product;
2794: PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2797: MatCheckProduct(mat,1);
2798: PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);
2799: if (!product->A->boundtocpu && !product->B->boundtocpu) {
2800: PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);
2801: }
2802: if (product->type == MATPRODUCT_ABC) {
2803: Ciscusp = PETSC_FALSE;
2804: if (!product->C->boundtocpu) {
2805: PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);
2806: }
2807: }
2808: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2809: PetscBool usecpu = PETSC_FALSE;
2810: switch (product->type) {
2811: case MATPRODUCT_AB:
2812: if (product->api_user) {
2813: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
2814: PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2815: PetscOptionsEnd();
2816: } else {
2817: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
2818: PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2819: PetscOptionsEnd();
2820: }
2821: break;
2822: case MATPRODUCT_AtB:
2823: if (product->api_user) {
2824: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
2825: PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2826: PetscOptionsEnd();
2827: } else {
2828: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
2829: PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2830: PetscOptionsEnd();
2831: }
2832: break;
2833: case MATPRODUCT_PtAP:
2834: if (product->api_user) {
2835: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
2836: PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2837: PetscOptionsEnd();
2838: } else {
2839: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
2840: PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2841: PetscOptionsEnd();
2842: }
2843: break;
2844: case MATPRODUCT_RARt:
2845: if (product->api_user) {
2846: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
2847: PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2848: PetscOptionsEnd();
2849: } else {
2850: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
2851: PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2852: PetscOptionsEnd();
2853: }
2854: break;
2855: case MATPRODUCT_ABC:
2856: if (product->api_user) {
2857: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
2858: PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2859: PetscOptionsEnd();
2860: } else {
2861: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
2862: PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2863: PetscOptionsEnd();
2864: }
2865: break;
2866: default:
2867: break;
2868: }
2869: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2870: }
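/* For example, running with -matmatmult_backend_cpu (or -matproduct_ab_backend_cpu when not going through the
   MatMatMult() API) forces C = A*B to be computed with the CPU AIJ kernels even when A and B are of type
   MATSEQAIJCUSPARSE. */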
2871: /* dispatch */
2872: if (isdense) {
2873: switch (product->type) {
2874: case MATPRODUCT_AB:
2875: case MATPRODUCT_AtB:
2876: case MATPRODUCT_ABt:
2877: case MATPRODUCT_PtAP:
2878: case MATPRODUCT_RARt:
2879: if (product->A->boundtocpu) {
2880: MatProductSetFromOptions_SeqAIJ_SeqDense(mat);
2881: } else {
2882: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2883: }
2884: break;
2885: case MATPRODUCT_ABC:
2886: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2887: break;
2888: default:
2889: break;
2890: }
2891: } else if (Biscusp && Ciscusp) {
2892: switch (product->type) {
2893: case MATPRODUCT_AB:
2894: case MATPRODUCT_AtB:
2895: case MATPRODUCT_ABt:
2896: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2897: break;
2898: case MATPRODUCT_PtAP:
2899: case MATPRODUCT_RARt:
2900: case MATPRODUCT_ABC:
2901: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2902: break;
2903: default:
2904: break;
2905: }
2906: } else { /* fallback for AIJ */
2907: MatProductSetFromOptions_SeqAIJ(mat);
2908: }
2909: return(0);
2910: }
2912: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2913: {
2917: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2918: return(0);
2919: }
2921: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2922: {
2926: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2927: return(0);
2928: }
2930: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2931: {
2935: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2936: return(0);
2937: }
2939: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2940: {
2944: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2945: return(0);
2946: }
2948: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2949: {
2953: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2954: return(0);
2955: }
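/* Helper CUDA kernel: y[idx[i]] += x[i] for 0 <= i < n. It is used below to scatter-add the compressed-row
   work vector (which only holds the nonzero rows) back into the full-length result vector. */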
2957: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2958: {
2959: int i = blockIdx.x*blockDim.x + threadIdx.x;
2960: if (i < n) y[idx[i]] += x[i];
2961: }
2963: /* z = op(A) x + y, where op(A) = A^T if (trans && !herm), op(A) = A^H if (trans && herm), and op(A) = A if !trans */
2964: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2965: {
2966: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2967: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2968: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2969: PetscScalar *xarray,*zarray,*dptr,*beta,*xptr;
2970: PetscErrorCode ierr;
2971: cusparseStatus_t stat;
2972: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2973: PetscBool compressed;
2974: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2975: PetscInt nx,ny;
2976: #endif
2979: if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian without transpose is not supported");
2980: if (!a->nonzerorowcnt) {
2981: if (!yy) {VecSet_SeqCUDA(zz,0);}
2982: else {VecCopy_SeqCUDA(yy,zz);}
2983: return(0);
2984: }
2985: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2986: MatSeqAIJCUSPARSECopyToGPU(A);
2987: if (!trans) {
2988: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2989: if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2990: } else {
2991: if (herm || !A->form_explicit_transpose) {
2992: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2993: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2994: } else {
2995: if (!cusparsestruct->matTranspose) {MatSeqAIJCUSPARSEFormExplicitTranspose(A);}
2996: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2997: }
2998: }
2999: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3000: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3002: try {
3003: VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
3004: if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so need to get uptodate zarray on GPU */
3005: else {VecCUDAGetArrayWrite(zz,&zarray);} /* write zz, so no need to init zarray on GPU */
3007: PetscLogGpuTimeBegin();
3008: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3009: /* z = A x + beta y.
3010: If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3011: When A is not compressed and z = y, we can set beta=1 to compute y = Ax + y in one call.
3012: */
3013: xptr = xarray;
3014: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3015: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3016: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3017: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3018: allocated to accommodate different uses. So we get the length info directly from mat.
3019: */
3020: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3021: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3022: nx = mat->num_cols;
3023: ny = mat->num_rows;
3024: }
3025: #endif
3026: } else {
3027: /* z = A^T x + beta y
3028: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3029: Note A^T x is of full length, so we set beta to 1.0 if y exists.
3030: */
3031: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3032: dptr = zarray;
3033: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3034: if (compressed) { /* Scatter x to work vector */
3035: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3036: thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3037: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3038: VecCUDAEqualsReverse());
3039: }
3040: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3041: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3042: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3043: nx = mat->num_rows;
3044: ny = mat->num_cols;
3045: }
3046: #endif
3047: }
3049: /* csr_spmv does y = alpha op(A) x + beta y */
3050: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3051: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3052: if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3053: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3054: cudaError_t cerr;
3055: stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
3056: stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
3057: stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
3058: matstruct->matDescr,
3059: matstruct->cuSpMV[opA].vecXDescr, beta,
3060: matstruct->cuSpMV[opA].vecYDescr,
3061: cusparse_scalartype,
3062: cusparsestruct->spmvAlg,
3063: &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
3064: cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
3066: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3067: } else {
3068: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3069: stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
3070: stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
3071: }
3073: stat = cusparseSpMV(cusparsestruct->handle, opA,
3074: matstruct->alpha_one,
3075: matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3076: matstruct->cuSpMV[opA].vecXDescr,
3077: beta,
3078: matstruct->cuSpMV[opA].vecYDescr,
3079: cusparse_scalartype,
3080: cusparsestruct->spmvAlg,
3081: matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
3082: #else
3083: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3084: stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
3085: mat->num_rows, mat->num_cols,
3086: mat->num_entries, matstruct->alpha_one, matstruct->descr,
3087: mat->values->data().get(), mat->row_offsets->data().get(),
3088: mat->column_indices->data().get(), xptr, beta,
3089: dptr);CHKERRCUSPARSE(stat);
3090: #endif
3091: } else {
3092: if (cusparsestruct->nrows) {
3093: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3094: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3095: #else
3096: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3097: stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
3098: matstruct->alpha_one, matstruct->descr, hybMat,
3099: xptr, beta,
3100: dptr);CHKERRCUSPARSE(stat);
3101: #endif
3102: }
3103: }
3104: PetscLogGpuTimeEnd();
3106: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3107: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3108: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3109: VecCopy_SeqCUDA(yy,zz); /* zz = yy */
3110: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3111: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
3112: }
3113: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3114: VecSet_SeqCUDA(zz,0);
3115: }
3117: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3118: if (compressed) {
3119: PetscLogGpuTimeBegin();
3120: /* I wanted to make this for_each asynchronous, but failed. thrust::async::for_each() returns an event (internally registered),
3121: and at the end of the scope its destructor calls cudaStreamSynchronize() on this stream. One would have to store all the events to
3122: prevent that, so I just added a ScatterAdd kernel instead.
3123: */
3124: #if 0
3125: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3126: thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3127: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3128: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3129: VecCUDAPlusEquals());
3130: #else
3131: PetscInt n = matstruct->cprowIndices->size();
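/* launch one thread per compressed row; the grid size is rounded up so that all n entries are covered */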
3132: ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
3133: #endif
3134: PetscLogGpuTimeEnd();
3135: }
3136: } else {
3137: if (yy && yy != zz) {
3138: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
3139: }
3140: }
3141: VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
3142: if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
3143: else {VecCUDARestoreArrayWrite(zz,&zarray);}
3144: } catch(char *ex) {
3145: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3146: }
3147: if (yy) {
3148: PetscLogGpuFlops(2.0*a->nz);
3149: } else {
3150: PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
3151: }
3152: return(0);
3153: }
3155: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3156: {
3160: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
3161: return(0);
3162: }
3164: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
3165: {
3166: PetscErrorCode ierr;
3167: PetscObjectState onnz = A->nonzerostate;
3168: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3171: MatAssemblyEnd_SeqAIJ(A,mode);
3172: if (onnz != A->nonzerostate && cusp->deviceMat) {
3173: cudaError_t cerr;
3175: PetscInfo(A,"Destroy device mat since nonzerostate changed\n");
3176: cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
3177: cusp->deviceMat = NULL;
3178: }
3179: return(0);
3180: }
3182: /* --------------------------------------------------------------------------------*/
3183: /*@
3184: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3185: (the default parallel PETSc format). This matrix will ultimately be pushed down
3186: to NVIDIA GPUs and will use the CUSPARSE library for calculations. For good matrix
3187: assembly performance the user should preallocate the matrix storage by setting
3188: the parameter nz (or the array nnz). By setting these parameters accurately,
3189: performance during matrix assembly can be increased by more than a factor of 50.
3191: Collective
3193: Input Parameters:
3194: + comm - MPI communicator, set to PETSC_COMM_SELF
3195: . m - number of rows
3196: . n - number of columns
3197: . nz - number of nonzeros per row (same for all rows)
3198: - nnz - array containing the number of nonzeros in the various rows
3199: (possibly different for each row) or NULL
3201: Output Parameter:
3202: . A - the matrix
3204: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3205: MatXXXXSetPreallocation() paradigm instead of this routine directly.
3206: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3208: Notes:
3209: If nnz is given then nz is ignored
3211: The AIJ format (also called the Yale sparse matrix format or
3212: compressed row storage) is fully compatible with standard Fortran 77
3213: storage. That is, the stored row and column indices can begin at
3214: either one (as in Fortran) or zero. See the users' manual for details.
3216: Specify the preallocated storage with either nz or nnz (not both).
3217: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3218: allocation. For large problems you MUST preallocate memory or you
3219: will get TERRIBLE performance, see the users' manual chapter on matrices.
3221: By default, this format uses inodes (identical nodes) when possible, to
3222: improve numerical efficiency of matrix-vector products and solves. We
3223: search for consecutive rows with the same nonzero structure, thereby
3224: reusing matrix information to achieve increased efficiency.
3226: Level: intermediate
3228: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3229: @*/
3230: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3231: {
3235: MatCreate(comm,A);
3236: MatSetSizes(*A,m,n,m,n);
3237: MatSetType(*A,MATSEQAIJCUSPARSE);
3238: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
3239: return(0);
3240: }
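/* A minimal usage sketch (illustrative only: x and y are assumed to be suitable vectors created elsewhere,
   and error checking is omitted):

     Mat A;
     MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,n,n,3,NULL,&A);
     ... insert entries with MatSetValues() ...
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
     MatMult(A,x,y);
     MatDestroy(&A);

   The MatMult() above is executed on the GPU through the CUSPARSE library. */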
3242: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3243: {
3247: if (A->factortype == MAT_FACTOR_NONE) {
3248: MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
3249: } else {
3250: MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
3251: }
3252: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3253: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
3254: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3255: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3256: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3257: PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
3258: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3259: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3260: PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);
3261: MatDestroy_SeqAIJ(A);
3262: return(0);
3263: }
3265: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3266: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
3267: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
3268: {
3272: MatDuplicate_SeqAIJ(A,cpvalues,B);
3273: MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
3274: return(0);
3275: }
3277: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
3278: {
3279: PetscErrorCode ierr;
3280: Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3281: Mat_SeqAIJCUSPARSE *cy;
3282: Mat_SeqAIJCUSPARSE *cx;
3283: PetscScalar *ay;
3284: const PetscScalar *ax;
3285: CsrMatrix *csry,*csrx;
3288: cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3289: cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3290: if (X->ops->axpy != Y->ops->axpy) {
3291: MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3292: MatAXPY_SeqAIJ(Y,a,X,str);
3293: return(0);
3294: }
3295: /* if we are here, it means both matrices are of type SeqAIJCUSPARSE and neither is bound to the CPU */
3296: MatSeqAIJCUSPARSECopyToGPU(Y);
3297: MatSeqAIJCUSPARSECopyToGPU(X);
3298: if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3299: if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3300: csry = (CsrMatrix*)cy->mat->mat;
3301: csrx = (CsrMatrix*)cx->mat->mat;
3302: /* see if we can turn this into a cublas axpy */
3303: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3304: bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3305: if (eq) {
3306: eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3307: }
3308: if (eq) str = SAME_NONZERO_PATTERN;
3309: }
3310: /* spgeam is buggy with one column */
3311: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3313: if (str == SUBSET_NONZERO_PATTERN) {
3314: cusparseStatus_t stat;
3315: PetscScalar b = 1.0;
3316: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3317: size_t bufferSize;
3318: void *buffer;
3319: cudaError_t cerr;
3320: #endif
3322: MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3323: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3324: stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3325: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3326: stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3327: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3328: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3329: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3330: cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3331: PetscLogGpuTimeBegin();
3332: stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3333: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3334: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3335: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3336: PetscLogGpuFlops(x->nz + y->nz);
3337: PetscLogGpuTimeEnd();
3338: cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3339: #else
3340: PetscLogGpuTimeBegin();
3341: stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3342: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3343: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3344: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3345: PetscLogGpuFlops(x->nz + y->nz);
3346: PetscLogGpuTimeEnd();
3347: #endif
3348: stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3349: MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3350: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3351: MatSeqAIJInvalidateDiagonal(Y);
3352: } else if (str == SAME_NONZERO_PATTERN) {
3353: cublasHandle_t cublasv2handle;
3354: cublasStatus_t berr;
3355: PetscBLASInt one = 1, bnz = 1;
3357: MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3358: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3359: PetscCUBLASGetHandle(&cublasv2handle);
3360: PetscBLASIntCast(x->nz,&bnz);
3361: PetscLogGpuTimeBegin();
3362: berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3363: PetscLogGpuFlops(2.0*bnz);
3364: PetscLogGpuTimeEnd();
3365: MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3366: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3367: MatSeqAIJInvalidateDiagonal(Y);
3368: } else {
3369: MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3370: MatAXPY_SeqAIJ(Y,a,X,str);
3371: }
3372: return(0);
3373: }
3375: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3376: {
3378: Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data;
3379: PetscScalar *ay;
3380: cublasHandle_t cublasv2handle;
3381: cublasStatus_t berr;
3382: PetscBLASInt one = 1, bnz = 1;
3385: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3386: PetscCUBLASGetHandle(&cublasv2handle);
3387: PetscBLASIntCast(y->nz,&bnz);
3388: PetscLogGpuTimeBegin();
3389: berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
3390: PetscLogGpuFlops(bnz);
3391: PetscLogGpuTimeEnd();
3392: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3393: MatSeqAIJInvalidateDiagonal(Y);
3394: return(0);
3395: }
3397: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3398: {
3400: PetscBool both = PETSC_FALSE;
3401: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3404: if (A->factortype == MAT_FACTOR_NONE) {
3405: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
3406: if (spptr->mat) {
3407: CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
3408: if (matrix->values) {
3409: both = PETSC_TRUE;
3410: thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3411: }
3412: }
3413: if (spptr->matTranspose) {
3414: CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
3415: if (matrix->values) {
3416: thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3417: }
3418: }
3419: }
3420: //MatZeroEntries_SeqAIJ(A);
3421: PetscArrayzero(a->a,a->i[A->rmap->n]);
3422: MatSeqAIJInvalidateDiagonal(A);
3423: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3424: else A->offloadmask = PETSC_OFFLOAD_CPU;
3425: return(0);
3426: }
3428: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3429: {
3430: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3434: if (A->factortype != MAT_FACTOR_NONE) return(0);
3435: if (flg) {
3436: MatSeqAIJCUSPARSECopyFromGPU(A);
3438: A->ops->scale = MatScale_SeqAIJ;
3439: A->ops->axpy = MatAXPY_SeqAIJ;
3440: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
3441: A->ops->mult = MatMult_SeqAIJ;
3442: A->ops->multadd = MatMultAdd_SeqAIJ;
3443: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
3444: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
3445: A->ops->multhermitiantranspose = NULL;
3446: A->ops->multhermitiantransposeadd = NULL;
3447: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
3448: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3449: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3450: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3451: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3452: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3453: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);
3454: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3455: } else {
3456: A->ops->scale = MatScale_SeqAIJCUSPARSE;
3457: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
3458: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
3459: A->ops->mult = MatMult_SeqAIJCUSPARSE;
3460: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
3461: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
3462: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
3463: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3464: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3465: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
3466: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);
3467: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3468: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3469: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);
3470: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);
3471: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);
3472: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3473: }
3474: A->boundtocpu = flg;
3475: a->inode.use = flg;
3476: return(0);
3477: }
3479: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3480: {
3481: PetscErrorCode ierr;
3482: cusparseStatus_t stat;
3483: Mat B;
3486: PetscCUDAInitializeCheck(); /* first use of CUSPARSE may be via MatConvert */
3487: if (reuse == MAT_INITIAL_MATRIX) {
3488: MatDuplicate(A,MAT_COPY_VALUES,newmat);
3489: } else if (reuse == MAT_REUSE_MATRIX) {
3490: MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
3491: }
3492: B = *newmat;
3494: PetscFree(B->defaultvectype);
3495: PetscStrallocpy(VECCUDA,&B->defaultvectype);
3497: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3498: if (B->factortype == MAT_FACTOR_NONE) {
3499: Mat_SeqAIJCUSPARSE *spptr;
3500: PetscNew(&spptr);
3501: stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3502: stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3503: spptr->format = MAT_CUSPARSE_CSR;
3504: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3505: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3506: spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3507: #else
3508: spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3509: #endif
3510: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3511: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3512: #endif
3513: B->spptr = spptr;
3514: } else {
3515: Mat_SeqAIJCUSPARSETriFactors *spptr;
3517: PetscNew(&spptr);
3518: stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3519: stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3520: B->spptr = spptr;
3521: }
3522: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3523: }
3524: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
3525: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
3526: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
3527: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3528: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
3529: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
3531: MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
3532: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
3533: PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
3534: #if defined(PETSC_HAVE_HYPRE)
3535: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);
3536: #endif
3537: return(0);
3538: }
3540: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3541: {
3545: MatCreate_SeqAIJ(B);
3546: MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
3547: return(0);
3548: }
3550: /*MC
3551: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3553: A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3554: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later (and are no longer supported as of CUDA 11.0).
3555: All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3557: Options Database Keys:
3558: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3559: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3560: - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3562: Level: beginner
3564: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3565: M*/
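/* For instance, an application that calls MatSetFromOptions() on its matrices can select this type and its
   storage format from the command line (the executable name below is just a placeholder):

     ./app -mat_type aijcusparse -mat_cusparse_storage_format csr
*/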
3567: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3569: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3570: {
3574: MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);
3575: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
3576: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
3577: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
3578: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
3580: return(0);
3581: }
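/* Once registered, these factorizations can be requested at run time; for instance (a sketch, assuming an
   application that uses the PCFactor options) -pc_type ilu -pc_factor_mat_solver_type cusparse selects the
   CUSPARSE ILU factorization for a MATSEQAIJCUSPARSE matrix. */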
3583: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3584: {
3585: PetscErrorCode ierr;
3586: cusparseStatus_t stat;
3589: if (*cusparsestruct) {
3590: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
3591: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
3592: delete (*cusparsestruct)->workVector;
3593: delete (*cusparsestruct)->rowoffsets_gpu;
3594: delete (*cusparsestruct)->cooPerm;
3595: delete (*cusparsestruct)->cooPerm_a;
3596: delete (*cusparsestruct)->csr2csc_i;
3597: if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3598: PetscFree(*cusparsestruct);
3599: }
3600: return(0);
3601: }
3603: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3604: {
3606: if (*mat) {
3607: delete (*mat)->values;
3608: delete (*mat)->column_indices;
3609: delete (*mat)->row_offsets;
3610: delete *mat;
3611: *mat = 0;
3612: }
3613: return(0);
3614: }
3616: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3617: {
3618: cusparseStatus_t stat;
3619: PetscErrorCode ierr;
3622: if (*trifactor) {
3623: if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3624: if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
3625: CsrMatrix_Destroy(&(*trifactor)->csrMat);
3626: if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
3627: if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3628: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3629: if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3630: #endif
3631: PetscFree(*trifactor);
3632: }
3633: return(0);
3634: }
3636: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
3637: {
3638: CsrMatrix *mat;
3639: cusparseStatus_t stat;
3640: cudaError_t err;
3643: if (*matstruct) {
3644: if ((*matstruct)->mat) {
3645: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3646: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3647: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3648: #else
3649: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3650: stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3651: #endif
3652: } else {
3653: mat = (CsrMatrix*)(*matstruct)->mat;
3654: CsrMatrix_Destroy(&mat);
3655: }
3656: }
3657: if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
3658: delete (*matstruct)->cprowIndices;
3659: if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
3660: if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
3661: if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
3663: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3664: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3665: if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3666: for (int i=0; i<3; i++) {
3667: if (mdata->cuSpMV[i].initialized) {
3668: err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3669: stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3670: stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3671: }
3672: }
3673: #endif
3674: delete *matstruct;
3675: *matstruct = NULL;
3676: }
3677: return(0);
3678: }
3680: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
3681: {
3685: if (*trifactors) {
3686: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
3687: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
3688: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
3689: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
3690: delete (*trifactors)->rpermIndices;
3691: delete (*trifactors)->cpermIndices;
3692: delete (*trifactors)->workVector;
3693: (*trifactors)->rpermIndices = NULL;
3694: (*trifactors)->cpermIndices = NULL;
3695: (*trifactors)->workVector = NULL;
3696: if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
3697: if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
3698: (*trifactors)->init_dev_prop = PETSC_FALSE;
3699: }
3700: return(0);
3701: }
3703: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3704: {
3705: PetscErrorCode ierr;
3706: cusparseHandle_t handle;
3707: cusparseStatus_t stat;
3710: if (*trifactors) {
3711: MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
3712: if ((handle = (*trifactors)->handle)) {
3713: stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
3714: }
3715: PetscFree(*trifactors);
3716: }
3717: return(0);
3718: }
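/* Thrust functors used by the COO routines below: IJCompare orders (i,j) pairs lexicographically and IJEqual
   tests them for equality (to detect repeated COO entries); IJDiff maps two integers to 0 if they are equal and
   1 otherwise, and IJSum is a logical OR, both used when flagging unique entries during preallocation. */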
3720: struct IJCompare
3721: {
3722: __host__ __device__
3723: inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3724: {
3725: if (t1.get<0>() < t2.get<0>()) return true;
3726: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3727: return false;
3728: }
3729: };
3731: struct IJEqual
3732: {
3733: __host__ __device__
3734: inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3735: {
3736: if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3737: return true;
3738: }
3739: };
3741: struct IJDiff
3742: {
3743: __host__ __device__
3744: inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3745: {
3746: return t1 == t2 ? 0 : 1;
3747: }
3748: };
3750: struct IJSum
3751: {
3752: __host__ __device__
3753: inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3754: {
3755: return t1||t2;
3756: }
3757: };
3759: #include <thrust/iterator/discard_iterator.h>
3760: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3761: {
3762: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3763: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3764: THRUSTARRAY *cooPerm_v = NULL;
3765: thrust::device_ptr<const PetscScalar> d_v;
3766: CsrMatrix *matrix;
3767: PetscErrorCode ierr;
3768: PetscInt n;
3771: if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
3772: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
3773: if (!cusp->cooPerm) {
3774: MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
3775: MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
3776: return(0);
3777: }
3778: matrix = (CsrMatrix*)cusp->mat->mat;
3779: if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3780: if (!v) {
3781: if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3782: goto finalize;
3783: }
3784: n = cusp->cooPerm->size();
3785: if (isCudaMem(v)) {
3786: d_v = thrust::device_pointer_cast(v);
3787: } else {
3788: cooPerm_v = new THRUSTARRAY(n);
3789: cooPerm_v->assign(v,v+n);
3790: d_v = cooPerm_v->data();
3791: PetscLogCpuToGpu(n*sizeof(PetscScalar));
3792: }
3793: PetscLogGpuTimeBegin();
3794: if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3795: if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them up first */
3796: THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3797: auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3798: /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
3799: cooPerm_a = [0,0,1,2,3,4]. Its length is n, the number of entries in d_v[] (repeats included).
3800: cooPerm_a is nondecreasing; the i-th sorted entry, d_v[cooPerm[i]], contributes to the cooPerm_a[i]-th unique nonzero.
3801: */
3802: thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3803: thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3804: delete cooPerm_w;
3805: } else {
3806: /* all nonzeros in d_v[] are unique entries */
3807: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3808: matrix->values->begin()));
3809: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3810: matrix->values->end()));
3811: thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
3812: }
3813: } else {
3814: if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3815: auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3816: thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3817: } else {
3818: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3819: matrix->values->begin()));
3820: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3821: matrix->values->end()));
3822: thrust::for_each(zibit,zieit,VecCUDAEquals());
3823: }
3824: }
3825: PetscLogGpuTimeEnd();
3826: finalize:
3827: delete cooPerm_v;
3828: A->offloadmask = PETSC_OFFLOAD_GPU;
3829: PetscObjectStateIncrease((PetscObject)A);
3830: /* shorter version of MatAssemblyEnd_SeqAIJ */
3831: PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);
3832: PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");
3833: PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);
3834: a->reallocs = 0;
3835: A->info.mallocs += 0;
3836: A->info.nz_unneeded = 0;
3837: A->assembled = A->was_assembled = PETSC_TRUE;
3838: A->num_ass++;
3839: return(0);
3840: }
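/* Hedged, standalone sketch (not part of the original source): the duplicate-summing step of the
   ADD_VALUES path above, reduced to a toy example. cooPerm[k] is the position in the user array v[]
   of the k-th (row,col)-sorted entry, cooPerm_a[k] is the unique nonzero it contributes to. Demo
   names only; uses only thrust facilities already available in this file. */
static void DemoSumDuplicates(void)
{
  thrust::device_vector<PetscScalar> v(4);        /* user values in insertion order */
  v[0] = 1.; v[1] = 2.; v[2] = 3.; v[3] = 4.;     /* entries 0 and 1 hit the same (i,j) */
  thrust::device_vector<PetscInt> cooPerm(4),cooPerm_a(4);
  cooPerm[0] = 1; cooPerm[1] = 0; cooPerm[2] = 3; cooPerm[3] = 2;
  cooPerm_a[0] = 0; cooPerm_a[1] = 0; cooPerm_a[2] = 1; cooPerm_a[3] = 2;

  thrust::device_vector<PetscScalar> w(3);        /* one slot per unique nonzero */
  auto vbit = thrust::make_permutation_iterator(v.begin(),cooPerm.begin());
  thrust::reduce_by_key(cooPerm_a.begin(),cooPerm_a.end(),vbit,
                        thrust::make_discard_iterator(),w.begin(),
                        thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
  /* w = [3, 4, 3]; MatSetValuesCOO_SeqAIJCUSPARSE() then adds w to the CSR values */
}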
3842: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3843: {
3844: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3845: PetscErrorCode ierr;
3849: if (!cusp) return(0);
3850: if (destroy) {
3851: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
3852: delete cusp->csr2csc_i;
3853: cusp->csr2csc_i = NULL;
3854: }
3855: A->transupdated = PETSC_FALSE;
3856: return(0);
3857: }
3859: #include <thrust/binary_search.h>
3860: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
3861: {
3862: PetscErrorCode ierr;
3863: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3864: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3865: PetscInt cooPerm_n, nzr = 0;
3866: cudaError_t cerr;
3869: PetscLayoutSetUp(A->rmap);
3870: PetscLayoutSetUp(A->cmap);
3871: cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3872: if (n != cooPerm_n) {
3873: delete cusp->cooPerm;
3874: delete cusp->cooPerm_a;
3875: cusp->cooPerm = NULL;
3876: cusp->cooPerm_a = NULL;
3877: }
3878: if (n) {
3879: THRUSTINTARRAY d_i(n);
3880: THRUSTINTARRAY d_j(n);
3881: THRUSTINTARRAY ii(A->rmap->n);
3883: if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); }
3884: if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
3886: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
3887: d_i.assign(coo_i,coo_i+n);
3888: d_j.assign(coo_j,coo_j+n);
3890: /* Ex.
3891: n = 6
3892: coo_i = [3,3,1,4,1,4]
3893: coo_j = [3,2,2,5,2,6]
3894: */
3895: auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3896: auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
3898: PetscLogGpuTimeBegin();
3899: thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3900: thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
3901: *cusp->cooPerm_a = d_i; /* copy the sorted array */
3902: THRUSTINTARRAY w = d_j;
3904: /*
3905: d_i = [1,1,3,3,4,4]
3906: d_j = [2,2,2,3,5,6]
3907: cooPerm = [2,4,1,0,3,5]
3908: */
3909: auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
3911: /*
3912: d_i = [1,3,3,4,4,x]   (x marks the leftover slot past the new end)
3913: d_j = [2,2,3,5,6,x]
3914: ekey still marks the end of the original range;
3915: nekey, returned by thrust::unique, marks the new end, just before x
3916: */
3917: if (nekey == ekey) { /* all entries are unique */
3918: delete cusp->cooPerm_a;
3919: cusp->cooPerm_a = NULL;
3920: } else { /* Stefano: I couldn't come up with a more elegant algorithm */
3921: /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
3922: adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
3923: adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
3924: (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
3925: w[0] = 0;
3926: thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/
3927: thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
3928: }
3929: thrust::counting_iterator<PetscInt> search_begin(0);
3930: thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
3931: search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
3932: ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
3933: PetscLogGpuTimeEnd();
3935: MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);
3936: a->singlemalloc = PETSC_FALSE;
3937: a->free_a = PETSC_TRUE;
3938: a->free_ij = PETSC_TRUE;
3939: PetscMalloc1(A->rmap->n+1,&a->i);
3940: a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
3941: cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3942: a->nz = a->maxnz = a->i[A->rmap->n];
3943: a->rmax = 0;
3944: PetscMalloc1(a->nz,&a->a);
3945: PetscMalloc1(a->nz,&a->j);
3946: cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3947: if (!a->ilen) { PetscMalloc1(A->rmap->n,&a->ilen); }
3948: if (!a->imax) { PetscMalloc1(A->rmap->n,&a->imax); }
3949: for (PetscInt i = 0; i < A->rmap->n; i++) {
3950: const PetscInt nnzr = a->i[i+1] - a->i[i];
3951: nzr += (PetscInt)!!(nnzr);
3952: a->ilen[i] = a->imax[i] = nnzr;
3953: a->rmax = PetscMax(a->rmax,nnzr);
3954: }
3955: a->nonzerorowcnt = nzr;
3956: A->preallocated = PETSC_TRUE;
3957: PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));
3958: MatMarkDiagonal_SeqAIJ(A);
3959: } else {
3960: MatSeqAIJSetPreallocation(A,0,NULL);
3961: }
3962: MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);
3964: /* We want to allocate the CUSPARSE struct for matvec now.
3965: Rather than special-casing the still-empty values, we simply zero them on the host and copy the zeros to the GPU */
3966: PetscArrayzero(a->a,a->nz);
3967: MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);
3968: A->offloadmask = PETSC_OFFLOAD_CPU;
3969: A->nonzerostate++;
3970: MatSeqAIJCUSPARSECopyToGPU(A);
3971: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
3973: A->assembled = PETSC_FALSE;
3974: A->was_assembled = PETSC_FALSE;
3975: return(0);
3976: }
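/* Hedged usage sketch (not part of the original source): assembling a small MATSEQAIJCUSPARSE
   matrix through the COO path above, assuming the public MatSetPreallocationCOO()/MatSetValuesCOO()
   wrappers dispatch to the *_SeqAIJCUSPARSE routines in this file. The index data reuses the worked
   example from the comments above; the repeated (1,2) entry is summed by ADD_VALUES. */
static PetscErrorCode DemoCOOAssembly(void)
{
  Mat            A;
  PetscInt       coo_i[] = {3,3,1,4,1,4};
  PetscInt       coo_j[] = {3,2,2,5,2,6};
  PetscScalar    v[]     = {1.,1.,1.,1.,1.,1.};
  PetscErrorCode ierr;

  ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
  ierr = MatSetSizes(A,6,7,6,7);CHKERRQ(ierr);
  ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO(A,6,coo_i,coo_j);CHKERRQ(ierr);  /* builds cooPerm/cooPerm_a and the CSR pattern */
  ierr = MatSetValuesCOO(A,v,ADD_VALUES);CHKERRQ(ierr);          /* fills the device values; duplicates are summed */
  ierr = MatDestroy(&A);CHKERRQ(ierr);
  return(0);
}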
3978: /*@C
3979: MatSeqAIJCUSPARSEGetIJ - returns the device pointers to the CSR row offsets (i) and column indices (j) of a MATSEQAIJCUSPARSE matrix.
3981: Not collective
3983: Input Parameters:
3984: + A - the matrix
3985: - compressed - PETSC_TRUE if the row offsets may be returned in compressed (nonzero rows only) form, PETSC_FALSE to always obtain the full row offsets
3987: Output Parameters:
3988: + i - the CSR row offsets (device pointer)
3989: - j - the CSR column indices (device pointer)
3991: Level: developer
3993: Notes:
3994: When compressed is true, the CSR structure does not contain empty rows
3996: .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
3997: @*/
3998: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
3999: {
4000: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4001: CsrMatrix *csr;
4002: PetscErrorCode ierr;
4003: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4007: if (!i || !j) return(0);
4009: if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4010: MatSeqAIJCUSPARSECopyToGPU(A);
4011: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4012: csr = (CsrMatrix*)cusp->mat->mat;
4013: if (i) {
4014: if (!compressed && a->compressedrow.use) { /* need full row offset */
4015: if (!cusp->rowoffsets_gpu) {
4016: cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4017: cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4018: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
4019: }
4020: *i = cusp->rowoffsets_gpu->data().get();
4021: } else *i = csr->row_offsets->data().get();
4022: }
4023: if (j) *j = csr->column_indices->data().get();
4024: return(0);
4025: }
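/* Hedged usage sketch (not part of the original source): obtaining the device CSR index arrays of an
   assembled MATSEQAIJCUSPARSE matrix. The returned pointers live in GPU memory and must not be
   dereferenced on the host; they are suitable for user CUDA kernels or cuSPARSE calls that take
   32-bit indices. Demo name only. */
static PetscErrorCode DemoInspectIJ(Mat A)
{
  const int      *i,*j;
  PetscErrorCode ierr;

  ierr = MatSeqAIJCUSPARSEGetIJ(A,PETSC_TRUE,&i,&j);CHKERRQ(ierr);   /* compressed row offsets are acceptable here */
  /* ... launch a kernel or call cuSPARSE with i/j ... */
  ierr = MatSeqAIJCUSPARSERestoreIJ(A,PETSC_TRUE,&i,&j);CHKERRQ(ierr);
  return(0);
}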
4027: /*@C
4028: MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4030: Not collective
4032: Input Parameters:
4033: + A - the matrix
4034: - compressed - PETSC_TRUE or PETSC_FALSE, the same value that was passed to MatSeqAIJCUSPARSEGetIJ()
4036: Output Parameters:
4037: + i - the CSR row offsets (device pointer)
4038: - j - the CSR column indices (device pointer)
4040: Level: developer
4042: .seealso: MatSeqAIJCUSPARSEGetIJ()
4043: @*/
4044: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
4045: {
4049: if (i) *i = NULL;
4050: if (j) *j = NULL;
4051: return(0);
4052: }
4054: /*@C
4055: MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4057: Not Collective
4059: Input Parameter:
4060: . A - a MATSEQAIJCUSPARSE matrix
4062: Output Parameter:
4063: . a - pointer to the device data
4065: Level: developer
4067: Notes: may trigger a host-to-device copy if the up-to-date matrix data is on the host
4069: .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4070: @*/
4071: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
4072: {
4073: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4074: CsrMatrix *csr;
4075: PetscErrorCode ierr;
4081: if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4082: MatSeqAIJCUSPARSECopyToGPU(A);
4083: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4084: csr = (CsrMatrix*)cusp->mat->mat;
4085: if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4086: *a = csr->values->data().get();
4087: return(0);
4088: }
4090: /*@C
4091: MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4093: Not Collective
4095: Input Parameter:
4096: . A - a MATSEQAIJCUSPARSE matrix
4098: Output Parameter:
4099: . a - pointer to the device data
4101: Level: developer
4103: .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4104: @*/
4105: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
4106: {
4111: *a = NULL;
4112: return(0);
4113: }
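/* Hedged usage sketch (not part of the original source): read-only access to the device CSR values,
   copied back to a caller-provided host buffer of length a->nz. Demo name only. */
static PetscErrorCode DemoCopyValuesToHost(Mat A, PetscScalar h_vals[])
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  const PetscScalar *av;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);           /* may copy host data to the GPU first */
  cerr = cudaMemcpy(h_vals,av,a->nz*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  return(0);
}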
4115: /*@C
4116: MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4118: Not Collective
4120: Input Parameter:
4121: . A - a MATSEQAIJCUSPARSE matrix
4123: Output Parameter:
4124: . a - pointer to the device data
4126: Level: developer
4128: Notes: may trigger a host-to-device copy if the up-to-date matrix data is on the host
4130: .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4131: @*/
4132: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
4133: {
4134: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4135: CsrMatrix *csr;
4136: PetscErrorCode ierr;
4142: if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4143: MatSeqAIJCUSPARSECopyToGPU(A);
4144: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4145: csr = (CsrMatrix*)cusp->mat->mat;
4146: if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4147: *a = csr->values->data().get();
4148: A->offloadmask = PETSC_OFFLOAD_GPU;
4149: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
4150: return(0);
4151: }
4152: /*@C
4153: MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4155: Not Collective
4157: Input Parameter:
4158: . A - a MATSEQAIJCUSPARSE matrix
4160: Output Parameter:
4161: . a - pointer to the device data
4163: Level: developer
4165: .seealso: MatSeqAIJCUSPARSEGetArray()
4166: @*/
4167: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
4168: {
4175: PetscObjectStateIncrease((PetscObject)A);
4176: *a = NULL;
4177: return(0);
4178: }
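/* Hedged usage sketch (not part of the original source): in-place scaling of the device values
   through the read-write accessor. MatSeqAIJCUSPARSEGetArray() marks the GPU copy as the valid one
   and invalidates the cached transpose; MatSeqAIJCUSPARSERestoreArray() bumps the object state.
   Demo name only; uses only thrust facilities already available in this file. */
static PetscErrorCode DemoScaleDeviceValues(Mat A, PetscScalar alpha)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscScalar    *av;
  PetscErrorCode ierr;

  ierr = MatSeqAIJCUSPARSEGetArray(A,&av);CHKERRQ(ierr);
  {
    thrust::device_ptr<PetscScalar> dav = thrust::device_pointer_cast(av);
    thrust::transform(dav,dav+a->nz,thrust::make_constant_iterator(alpha),dav,thrust::multiplies<PetscScalar>());
  }
  ierr = MatSeqAIJCUSPARSERestoreArray(A,&av);CHKERRQ(ierr);
  return(0);
}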
4180: /*@C
4181: MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4183: Not Collective
4185: Input Parameter:
4186: . A - a MATSEQAIJCUSPARSE matrix
4188: Output Parameter:
4189: . a - pointer to the device data
4191: Level: developer
4193: Notes: does not trigger a host-to-device copy; the GPU data is flagged as the valid copy
4195: .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
4196: @*/
4197: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
4198: {
4199: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4200: CsrMatrix *csr;
4201: PetscErrorCode ierr;
4207: if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4208: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4209: csr = (CsrMatrix*)cusp->mat->mat;
4210: if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4211: *a = csr->values->data().get();
4212: A->offloadmask = PETSC_OFFLOAD_GPU;
4213: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
4214: return(0);
4215: }
4217: /*@C
4218: MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4220: Not Collective
4222: Input Parameter:
4223: . A - a MATSEQAIJCUSPARSE matrix
4225: Output Parameter:
4226: . a - pointer to the device data
4228: Level: developer
4230: .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
4231: @*/
4232: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
4233: {
4240: PetscObjectStateIncrease((PetscObject)A);
4241: *a = NULL;
4242: return(0);
4243: }
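/* Hedged usage sketch (not part of the original source): overwriting every stored value from an
   existing device buffer d_newvals (one entry per nonzero). The write-only accessor is appropriate
   here because the current values are not needed, so no host-to-device copy is triggered. Demo name
   only. */
static PetscErrorCode DemoOverwriteDeviceValues(Mat A, const PetscScalar *d_newvals)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscScalar    *av;
  cudaError_t    cerr;
  PetscErrorCode ierr;

  ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&av);CHKERRQ(ierr);
  cerr = cudaMemcpy(av,d_newvals,a->nz*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&av);CHKERRQ(ierr);
  return(0);
}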
4245: struct IJCompare4
4246: {
4247: __host__ __device__
4248: inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4249: {
4250: if (t1.get<0>() < t2.get<0>()) return true;
4251: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4252: return false;
4253: }
4254: };
4256: struct Shift
4257: {
4258: int _shift;
4260: Shift(int shift) : _shift(shift) {}
4261: __host__ __device__
4262: inline int operator() (const int &c)
4263: {
4264: return c + _shift;
4265: }
4266: };
4268: /* merges two SeqAIJCUSPARSE matrices A, B into C = [A,B]: row i of C is row i of A followed by row i of B ([A';B']' in MATLAB notation) */
4269: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4270: {
4271: PetscErrorCode ierr;
4272: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4273: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4274: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4275: CsrMatrix *Acsr,*Bcsr,*Ccsr;
4276: PetscInt Annz,Bnnz;
4277: cusparseStatus_t stat;
4278: PetscInt i,m,n,zero = 0;
4279: cudaError_t cerr;
4287: if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
4288: if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
4289: if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4290: if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4291: if (reuse == MAT_INITIAL_MATRIX) {
4292: m = A->rmap->n;
4293: n = A->cmap->n + B->cmap->n;
4294: MatCreate(PETSC_COMM_SELF,C);
4295: MatSetSizes(*C,m,n,m,n);
4296: MatSetType(*C,MATSEQAIJCUSPARSE);
4297: c = (Mat_SeqAIJ*)(*C)->data;
4298: Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4299: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
4300: Ccsr = new CsrMatrix;
4301: Cmat->cprowIndices = NULL;
4302: c->compressedrow.use = PETSC_FALSE;
4303: c->compressedrow.nrows = 0;
4304: c->compressedrow.i = NULL;
4305: c->compressedrow.rindex = NULL;
4306: Ccusp->workVector = NULL;
4307: Ccusp->nrows = m;
4308: Ccusp->mat = Cmat;
4309: Ccusp->mat->mat = Ccsr;
4310: Ccsr->num_rows = m;
4311: Ccsr->num_cols = n;
4312: stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4313: stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4314: stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4315: cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4316: cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4317: cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4318: cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4319: cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4320: cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4321: MatSeqAIJCUSPARSECopyToGPU(A);
4322: MatSeqAIJCUSPARSECopyToGPU(B);
4323: if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4324: if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4326: Acsr = (CsrMatrix*)Acusp->mat->mat;
4327: Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4328: Annz = (PetscInt)Acsr->column_indices->size();
4329: Bnnz = (PetscInt)Bcsr->column_indices->size();
4330: c->nz = Annz + Bnnz;
4331: Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4332: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4333: Ccsr->values = new THRUSTARRAY(c->nz);
4334: Ccsr->num_entries = c->nz;
4335: Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4336: if (c->nz) {
4337: auto Acoo = new THRUSTINTARRAY32(Annz);
4338: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4339: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4340: THRUSTINTARRAY32 *Aroff,*Broff;
4342: if (a->compressedrow.use) { /* need full row offset */
4343: if (!Acusp->rowoffsets_gpu) {
4344: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4345: Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4346: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
4347: }
4348: Aroff = Acusp->rowoffsets_gpu;
4349: } else Aroff = Acsr->row_offsets;
4350: if (b->compressedrow.use) { /* need full row offset */
4351: if (!Bcusp->rowoffsets_gpu) {
4352: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4353: Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4354: PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
4355: }
4356: Broff = Bcusp->rowoffsets_gpu;
4357: } else Broff = Bcsr->row_offsets;
4358: PetscLogGpuTimeBegin();
4359: stat = cusparseXcsr2coo(Acusp->handle,
4360: Aroff->data().get(),
4361: Annz,
4362: m,
4363: Acoo->data().get(),
4364: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4365: stat = cusparseXcsr2coo(Bcusp->handle,
4366: Broff->data().get(),
4367: Bnnz,
4368: m,
4369: Bcoo->data().get(),
4370: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4371: /* Issues were observed using bool markers with large matrices on SUMMIT (CUDA 10.2.89), hence the int constant iterators below */
4372: auto Aperm = thrust::make_constant_iterator(1);
4373: auto Bperm = thrust::make_constant_iterator(0);
4374: #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4375: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4376: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4377: #else
4378: /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4379: auto Bcib = Bcsr->column_indices->begin();
4380: auto Bcie = Bcsr->column_indices->end();
4381: thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4382: #endif
4383: auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4384: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4385: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4386: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4387: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4388: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4389: auto p1 = Ccusp->cooPerm->begin();
4390: auto p2 = Ccusp->cooPerm->begin();
4391: thrust::advance(p2,Annz);
4392: PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4393: #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4394: thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4395: #endif
4396: auto cci = thrust::make_counting_iterator(zero);
4397: auto cce = thrust::make_counting_iterator(c->nz);
4398: #if 0 //Errors on SUMMIT cuda 11.1.0
4399: PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4400: #else
4401: auto pred = thrust::identity<int>();
4402: PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4403: PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4404: #endif
4405: stat = cusparseXcoo2csr(Ccusp->handle,
4406: Ccoo->data().get(),
4407: c->nz,
4408: m,
4409: Ccsr->row_offsets->data().get(),
4410: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4411: PetscLogGpuTimeEnd();
4412: delete wPerm;
4413: delete Acoo;
4414: delete Bcoo;
4415: delete Ccoo;
4416: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4417: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4418: Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4419: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4420: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4421: #endif
4422: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4423: MatSeqAIJCUSPARSEFormExplicitTranspose(A);
4424: MatSeqAIJCUSPARSEFormExplicitTranspose(B);
4425: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4426: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4427: CsrMatrix *CcsrT = new CsrMatrix;
4428: CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4429: CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4431: (*C)->form_explicit_transpose = PETSC_TRUE;
4432: (*C)->transupdated = PETSC_TRUE;
4433: Ccusp->rowoffsets_gpu = NULL;
4434: CmatT->cprowIndices = NULL;
4435: CmatT->mat = CcsrT;
4436: CcsrT->num_rows = n;
4437: CcsrT->num_cols = m;
4438: CcsrT->num_entries = c->nz;
4440: CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4441: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4442: CcsrT->values = new THRUSTARRAY(c->nz);
4444: PetscLogGpuTimeBegin();
4445: auto rT = CcsrT->row_offsets->begin();
4446: if (AT) {
4447: rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4448: thrust::advance(rT,-1);
4449: }
4450: if (BT) {
4451: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4452: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4453: thrust::copy(titb,tite,rT);
4454: }
4455: auto cT = CcsrT->column_indices->begin();
4456: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4457: if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4458: auto vT = CcsrT->values->begin();
4459: if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4460: if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4461: PetscLogGpuTimeEnd();
4463: stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4464: stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4465: stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4466: cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4467: cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4468: cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4469: cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4470: cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4471: cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4472: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4473: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4474: CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4475: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4476: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4477: #endif
4478: Ccusp->matTranspose = CmatT;
4479: }
4480: }
4482: c->singlemalloc = PETSC_FALSE;
4483: c->free_a = PETSC_TRUE;
4484: c->free_ij = PETSC_TRUE;
4485: PetscMalloc1(m+1,&c->i);
4486: PetscMalloc1(c->nz,&c->j);
4487: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4488: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4489: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4490: ii = *Ccsr->row_offsets;
4491: jj = *Ccsr->column_indices;
4492: cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4493: cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4494: } else {
4495: cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4496: cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4497: }
4498: PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
4499: PetscMalloc1(m,&c->ilen);
4500: PetscMalloc1(m,&c->imax);
4501: c->maxnz = c->nz;
4502: c->nonzerorowcnt = 0;
4503: c->rmax = 0;
4504: for (i = 0; i < m; i++) {
4505: const PetscInt nn = c->i[i+1] - c->i[i];
4506: c->ilen[i] = c->imax[i] = nn;
4507: c->nonzerorowcnt += (PetscInt)!!nn;
4508: c->rmax = PetscMax(c->rmax,nn);
4509: }
4510: MatMarkDiagonal_SeqAIJ(*C);
4511: PetscMalloc1(c->nz,&c->a);
4512: (*C)->nonzerostate++;
4513: PetscLayoutSetUp((*C)->rmap);
4514: PetscLayoutSetUp((*C)->cmap);
4515: Ccusp->nonzerostate = (*C)->nonzerostate;
4516: (*C)->preallocated = PETSC_TRUE;
4517: } else {
4518: if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
4519: c = (Mat_SeqAIJ*)(*C)->data;
4520: if (c->nz) {
4521: Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4522: if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4523: if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4524: if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4525: MatSeqAIJCUSPARSECopyToGPU(A);
4526: MatSeqAIJCUSPARSECopyToGPU(B);
4527: if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4528: if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4529: Acsr = (CsrMatrix*)Acusp->mat->mat;
4530: Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4531: Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4532: if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4533: if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4534: if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4535: if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4536: if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4537: auto pmid = Ccusp->cooPerm->begin();
4538: thrust::advance(pmid,Acsr->num_entries);
4539: PetscLogGpuTimeBegin();
4540: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4541: thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4542: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4543: thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4544: thrust::for_each(zibait,zieait,VecCUDAEquals());
4545: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4546: thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4547: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4548: thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4549: thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4550: MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);
4551: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4552: if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4553: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4554: CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4555: CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4556: CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4557: auto vT = CcsrT->values->begin();
4558: if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4559: if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4560: (*C)->transupdated = PETSC_TRUE;
4561: }
4562: PetscLogGpuTimeEnd();
4563: }
4564: }
4565: PetscObjectStateIncrease((PetscObject)*C);
4566: (*C)->assembled = PETSC_TRUE;
4567: (*C)->was_assembled = PETSC_FALSE;
4568: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4569: return(0);
4570: }
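/* Hedged, standalone sketch (not part of the original source): the core merge step above, reduced to
   two tiny COO streams. Entries are merged by (row,col) with the IJCompare4 functor defined earlier;
   the fourth tuple slot (1 for A, 0 for B) records each entry's origin, which the routine above later
   splits into the two halves of cooPerm with copy_if/remove_copy_if. Demo names only; uses only
   thrust facilities already available in this file. */
static void DemoMergeCOOStreams(void)
{
  thrust::device_vector<int>         Ai(2),Aj(2),Bi(2),Bj(2),Ci(4),Cj(4),Cflag(4);
  thrust::device_vector<PetscScalar> Av(2),Bv(2),Cv(4);
  Ai[0] = 0; Aj[0] = 0; Av[0] = 1.; Ai[1] = 1; Aj[1] = 0; Av[1] = 2.;  /* A entries (0,0),(1,0) */
  Bi[0] = 0; Bj[0] = 2; Bv[0] = 3.; Bi[1] = 1; Bj[1] = 3; Bv[1] = 4.;  /* B entries (0,2),(1,3), columns already shifted */

  auto Aperm = thrust::make_constant_iterator(1);
  auto Bperm = thrust::make_constant_iterator(0);
  auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Ai.begin(),Aj.begin(),Av.begin(),Aperm));
  auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Ai.end(),Aj.end(),Av.end(),Aperm));
  auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bi.begin(),Bj.begin(),Bv.begin(),Bperm));
  auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bi.end(),Bj.end(),Bv.end(),Bperm));
  auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ci.begin(),Cj.begin(),Cv.begin(),Cflag.begin()));
  thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4());
  /* C is now sorted by (row,col): (0,0,1.,1),(0,2,3.,0),(1,0,2.,1),(1,3,4.,0) */
}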
4572: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4573: {
4574: PetscErrorCode ierr;
4575: bool dmem;
4576: const PetscScalar *av;
4577: cudaError_t cerr;
4580: dmem = isCudaMem(v);
4581: MatSeqAIJCUSPARSEGetArrayRead(A,&av);
4582: if (n && idx) {
4583: THRUSTINTARRAY widx(n);
4584: widx.assign(idx,idx+n);
4585: PetscLogCpuToGpu(n*sizeof(PetscInt));
4587: THRUSTARRAY *w = NULL;
4588: thrust::device_ptr<PetscScalar> dv;
4589: if (dmem) {
4590: dv = thrust::device_pointer_cast(v);
4591: } else {
4592: w = new THRUSTARRAY(n);
4593: dv = w->data();
4594: }
4595: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4597: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4598: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4599: thrust::for_each(zibit,zieit,VecCUDAEquals());
4600: if (w) {
4601: cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4602: }
4603: delete w;
4604: } else {
4605: cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4606: }
4607: if (!dmem) { PetscLogGpuToCpu(n*sizeof(PetscScalar)); } /* the gathered values were copied from the GPU back to host memory */
4608: MatSeqAIJCUSPARSERestoreArrayRead(A,&av);
4609: return(0);
4610: }
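/* Hedged, standalone sketch (not part of the original source): the zip-iterator/for_each pattern in
   MatSeqAIJCopySubArray_SeqAIJCUSPARSE() above performs an element-wise gather, v[k] = av[idx[k]];
   thrust::gather expresses the same operation directly. Demo names only. */
#include <thrust/gather.h>
static void DemoGatherSubArray(void)
{
  thrust::device_vector<PetscScalar> av(4),v(2);
  thrust::device_vector<int>         idx(2);
  av[0] = 10.; av[1] = 11.; av[2] = 12.; av[3] = 13.;
  idx[0] = 3; idx[1] = 1;
  thrust::gather(idx.begin(),idx.end(),av.begin(),v.begin()); /* v = [13, 11] */
}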