Actual source code: aijcusparse.cu
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library.
4: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_CXX_COMPLEX_FIX
7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
9: #include <petscconf.h>
10: #include <../src/mat/impls/aij/seq/aij.h>
11: #include <../src/mat/impls/sbaij/seq/sbaij.h>
12: #include <../src/vec/vec/impls/dvecimpl.h>
13: #include <petsc/private/vecimpl.h>
14: #undef VecType
15: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16: #include <thrust/async/for_each.h>
17: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
18: #include <cooperative_groups.h>
19: #endif
20: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
21: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
22: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we copy them in
23: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
25: typedef enum {
26: CUSPARSE_MV_ALG_DEFAULT = 0,
27: CUSPARSE_COOMV_ALG = 1,
28: CUSPARSE_CSRMV_ALG1 = 2,
29: CUSPARSE_CSRMV_ALG2 = 3
30: } cusparseSpMVAlg_t;
32: typedef enum {
33: CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
34: CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
35: CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
36: CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
37: CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
38: CUSPARSE_SPMM_ALG_DEFAULT = 0,
39: CUSPARSE_SPMM_COO_ALG1 = 1,
40: CUSPARSE_SPMM_COO_ALG2 = 2,
41: CUSPARSE_SPMM_COO_ALG3 = 3,
42: CUSPARSE_SPMM_COO_ALG4 = 5,
43: CUSPARSE_SPMM_CSR_ALG1 = 4,
44: CUSPARSE_SPMM_CSR_ALG2 = 6,
45: } cusparseSpMMAlg_t;
47: typedef enum {
48: CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
49: CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic
50: } cusparseCsr2CscAlg_t;
51: */
52: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
53: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
54: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
55: #endif
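/* Note on ordering (an illustrative example): the strings above are listed in 0-based enum-value
   order, so e.g. MatCUSPARSESpMVAlgorithms[2] is "CSRMV_ALG1" and PetscOptionsEnum() therefore
   maps a user choice of "csrmv_alg1" to the integer 2 == CUSPARSE_CSRMV_ALG1; the consistency
   checks in MatSetFromOptions_SeqAIJCUSPARSE() below guard against cuSPARSE changing these values. */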
57: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
58: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
59: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*);
62: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*);
63: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
67: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
82: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
86: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
87: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
89: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
90: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
91: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
93: PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
94: PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
96: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
98: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
99: {
100: cusparseStatus_t stat;
101: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
104: if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
105: cusparsestruct->stream = stream;
106: stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
107: return(0);
108: }
110: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
111: {
112: cusparseStatus_t stat;
113: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
116: if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
117: if (cusparsestruct->handle != handle) {
118: if (cusparsestruct->handle) {
119: stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
120: }
121: cusparsestruct->handle = handle;
122: }
123: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
124: return(0);
125: }
127: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
128: {
129: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
130: PetscBool flg;
131: PetscErrorCode ierr;
134: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
135: if (!flg || !cusparsestruct) return(0);
136: if (cusparsestruct->handle) cusparsestruct->handle = 0;
137: return(0);
138: }
140: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
141: {
143: *type = MATSOLVERCUSPARSE;
144: return(0);
145: }
147: /*MC
148: MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
149: on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. The currently supported
150: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
151: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
152: CUSPARSE triangular solve algorithm. However, the performance can be quite poor, so these
153: algorithms are not recommended. This class does NOT support direct solver operations.
155: Level: beginner
157: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158: M*/
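/*
   A minimal usage sketch (not part of this file; assumes a KSP object ksp whose operator A has
   already been created with MatSetType(A,MATSEQAIJCUSPARSE)):

     PC pc;
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCILU);                              // ILU(k); triangular solves handled by cusparse
     PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);   // request the cusparse solver type
     KSPSetFromOptions(ksp);

   The same selection can be made from the command line with
     -pc_type ilu -pc_factor_mat_solver_type cusparse
*/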
160: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
161: {
163: PetscInt n = A->rmap->n;
166: MatCreate(PetscObjectComm((PetscObject)A),B);
167: MatSetSizes(*B,n,n,n,n);
168: (*B)->factortype = ftype;
169: (*B)->useordering = PETSC_TRUE;
170: MatSetType(*B,MATSEQAIJCUSPARSE);
172: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
173: MatSetBlockSizesFromMats(*B,A,A);
174: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
175: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
176: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
177: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
178: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
179: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
181: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
182: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
183: return(0);
184: }
186: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
187: {
188: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
191: switch (op) {
192: case MAT_CUSPARSE_MULT:
193: cusparsestruct->format = format;
194: break;
195: case MAT_CUSPARSE_ALL:
196: cusparsestruct->format = format;
197: break;
198: default:
199: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
200: }
201: return(0);
202: }
204: /*@
205: MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
206: operation. Only the MatMult operation can use different GPU storage formats
207: for MPIAIJCUSPARSE matrices.
208: Not Collective
210: Input Parameters:
211: + A - Matrix of type SEQAIJCUSPARSE
212: . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
213: - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, or MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)
215: Output Parameter:
217: Level: intermediate
219: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
220: @*/
221: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
222: {
227: PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
228: return(0);
229: }
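/*
   A usage sketch (hypothetical call site, not part of this file): select ELL storage for
   matrix-vector products only, leaving the factorization path untouched; note that ELL and HYB
   are only available before CUDA 11:

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);

   The equivalent options-database setting, picked up by MatSetFromOptions(), is
     -mat_cusparse_mult_storage_format ell
*/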
231: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
232: {
236: switch (op) {
237: case MAT_FORM_EXPLICIT_TRANSPOSE:
238: /* need to destroy the transpose matrix if present, to prevent logic errors if flg is later set to true */
239: if (A->form_explicit_transpose && !flg) {MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);}
240: A->form_explicit_transpose = flg;
241: break;
242: default:
243: MatSetOption_SeqAIJ(A,op,flg);
244: break;
245: }
246: return(0);
247: }
249: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
251: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
252: {
253: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
254: IS isrow = b->row,iscol = b->col;
255: PetscBool row_identity,col_identity;
259: MatSeqAIJCUSPARSECopyFromGPU(A);
260: MatLUFactorNumeric_SeqAIJ(B,A,info);
261: B->offloadmask = PETSC_OFFLOAD_CPU;
262: /* determine which version of MatSolve needs to be used. */
263: ISIdentity(isrow,&row_identity);
264: ISIdentity(iscol,&col_identity);
265: if (row_identity && col_identity) {
266: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
267: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
268: B->ops->matsolve = NULL;
269: B->ops->matsolvetranspose = NULL;
270: } else {
271: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
272: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
273: B->ops->matsolve = NULL;
274: B->ops->matsolvetranspose = NULL;
275: }
277: /* get the triangular factors */
278: MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
279: return(0);
280: }
282: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
283: {
284: PetscErrorCode ierr;
285: MatCUSPARSEStorageFormat format;
286: PetscBool flg;
287: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
290: PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
291: if (A->factortype == MAT_FACTOR_NONE) {
292: PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
293: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
294: if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);}
296: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
297: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
298: if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);}
299: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
300: PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
301: "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
302: /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
303: if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
305: PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
306: "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
307: if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
309: PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
310: "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
311: if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
312: #endif
313: }
314: PetscOptionsTail();
315: return(0);
316: }
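/*
   Example options (a sketch of how the entries registered above appear on the command line;
   the spmv/spmm/csr2csc options exist only when built against CUDA 11 or later):

     -mat_cusparse_storage_format csr        # storage used for both SpMV and TriSolve
     -mat_cusparse_mult_storage_format hyb   # storage used for SpMV only
     -mat_cusparse_spmv_alg csrmv_alg1       # cuSPARSE SpMV algorithm
*/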
318: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
319: {
320: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
321: PetscErrorCode ierr;
324: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
325: MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
326: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
327: return(0);
328: }
330: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
331: {
332: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
333: PetscErrorCode ierr;
336: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
337: MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
338: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
339: return(0);
340: }
342: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
343: {
344: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
345: PetscErrorCode ierr;
348: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
349: MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
350: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
351: return(0);
352: }
354: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
355: {
356: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
357: PetscErrorCode ierr;
360: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
361: MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
362: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
363: return(0);
364: }
366: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
367: {
368: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
369: PetscInt n = A->rmap->n;
370: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
371: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
372: cusparseStatus_t stat;
373: const PetscInt *ai = a->i,*aj = a->j,*vi;
374: const MatScalar *aa = a->a,*v;
375: PetscInt *AiLo, *AjLo;
376: PetscInt i,nz, nzLower, offset, rowOffset;
377: PetscErrorCode ierr;
378: cudaError_t cerr;
381: if (!n) return(0);
382: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
383: try {
384: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
385: nzLower=n+ai[n]-ai[1];
386: if (!loTriFactor) {
387: PetscScalar *AALo;
389: cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
391: /* Allocate Space for the lower triangular matrix */
392: cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
393: cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
395: /* Fill the lower triangular matrix */
396: AiLo[0] = (PetscInt) 0;
397: AiLo[n] = nzLower;
398: AjLo[0] = (PetscInt) 0;
399: AALo[0] = (MatScalar) 1.0;
400: v = aa;
401: vi = aj;
402: offset = 1;
403: rowOffset= 1;
404: for (i=1; i<n; i++) {
405: nz = ai[i+1] - ai[i];
406: /* additional 1 for the term on the diagonal */
407: AiLo[i] = rowOffset;
408: rowOffset += nz+1;
410: PetscArraycpy(&(AjLo[offset]), vi, nz);
411: PetscArraycpy(&(AALo[offset]), v, nz);
413: offset += nz;
414: AjLo[offset] = (PetscInt) i;
415: AALo[offset] = (MatScalar) 1.0;
416: offset += 1;
418: v += nz;
419: vi += nz;
420: }
422: /* allocate space for the triangular factor information */
423: PetscNew(&loTriFactor);
424: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
425: /* Create the matrix description */
426: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
427: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
428: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
429: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
430: #else
431: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
432: #endif
433: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
434: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
436: /* set the operation */
437: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
439: /* set the matrix */
440: loTriFactor->csrMat = new CsrMatrix;
441: loTriFactor->csrMat->num_rows = n;
442: loTriFactor->csrMat->num_cols = n;
443: loTriFactor->csrMat->num_entries = nzLower;
445: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
446: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
448: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
449: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
451: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
452: loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
454: /* Create the solve analysis information */
455: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
456: stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
457: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
458: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
459: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
460: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
461: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
462: &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
463: cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
464: #endif
466: /* perform the solve analysis */
467: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
468: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
469: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
470: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
471: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
472: ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
473: #endif
474: );CHKERRCUSPARSE(stat);
475: cerr = WaitForCUDA();CHKERRCUDA(cerr);
476: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
478: /* assign the pointer */
479: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
480: loTriFactor->AA_h = AALo;
481: cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
482: cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
483: PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
484: } else { /* update values only */
485: if (!loTriFactor->AA_h) {
486: cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
487: }
488: /* Fill the lower triangular matrix */
489: loTriFactor->AA_h[0] = 1.0;
490: v = aa;
491: vi = aj;
492: offset = 1;
493: for (i=1; i<n; i++) {
494: nz = ai[i+1] - ai[i];
495: PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);
496: offset += nz;
497: loTriFactor->AA_h[offset] = 1.0;
498: offset += 1;
499: v += nz;
500: }
501: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
502: PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));
503: }
504: } catch(char *ex) {
505: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
506: }
507: }
508: return(0);
509: }
511: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
512: {
513: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
514: PetscInt n = A->rmap->n;
515: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
516: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
517: cusparseStatus_t stat;
518: const PetscInt *aj = a->j,*adiag = a->diag,*vi;
519: const MatScalar *aa = a->a,*v;
520: PetscInt *AiUp, *AjUp;
521: PetscInt i,nz, nzUpper, offset;
522: PetscErrorCode ierr;
523: cudaError_t cerr;
526: if (!n) return(0);
527: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
528: try {
529: /* next, figure out the number of nonzeros in the upper triangular matrix. */
530: nzUpper = adiag[0]-adiag[n];
531: if (!upTriFactor) {
532: PetscScalar *AAUp;
534: cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
536: /* Allocate Space for the upper triangular matrix */
537: cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
538: cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
540: /* Fill the upper triangular matrix */
541: AiUp[0]=(PetscInt) 0;
542: AiUp[n]=nzUpper;
543: offset = nzUpper;
544: for (i=n-1; i>=0; i--) {
545: v = aa + adiag[i+1] + 1;
546: vi = aj + adiag[i+1] + 1;
548: /* number of elements NOT on the diagonal */
549: nz = adiag[i] - adiag[i+1]-1;
551: /* decrement the offset */
552: offset -= (nz+1);
554: /* first, set the diagonal elements */
555: AjUp[offset] = (PetscInt) i;
556: AAUp[offset] = (MatScalar)1./v[nz];
557: AiUp[i] = AiUp[i+1] - (nz+1);
559: PetscArraycpy(&(AjUp[offset+1]), vi, nz);
560: PetscArraycpy(&(AAUp[offset+1]), v, nz);
561: }
563: /* allocate space for the triangular factor information */
564: PetscNew(&upTriFactor);
565: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
567: /* Create the matrix description */
568: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
569: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
570: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
571: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
572: #else
573: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
574: #endif
575: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
576: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
578: /* set the operation */
579: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
581: /* set the matrix */
582: upTriFactor->csrMat = new CsrMatrix;
583: upTriFactor->csrMat->num_rows = n;
584: upTriFactor->csrMat->num_cols = n;
585: upTriFactor->csrMat->num_entries = nzUpper;
587: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
588: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
590: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
591: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
593: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
594: upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
596: /* Create the solve analysis information */
597: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
598: stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
599: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
600: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
601: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
602: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
603: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
604: &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
605: cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
606: #endif
608: /* perform the solve analysis */
609: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
610: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
611: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
612: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
613: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
614: ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
615: #endif
616: );CHKERRCUSPARSE(stat);
617: cerr = WaitForCUDA();CHKERRCUDA(cerr);
618: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
620: /* assign the pointer */
621: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
622: upTriFactor->AA_h = AAUp;
623: cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
624: cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
625: PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
626: } else {
627: if (!upTriFactor->AA_h) {
628: cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
629: }
630: /* Fill the upper triangular matrix */
631: offset = nzUpper;
632: for (i=n-1; i>=0; i--) {
633: v = aa + adiag[i+1] + 1;
635: /* number of elements NOT on the diagonal */
636: nz = adiag[i] - adiag[i+1]-1;
638: /* decrement the offset */
639: offset -= (nz+1);
641: /* first, set the diagonal elements */
642: upTriFactor->AA_h[offset] = 1./v[nz];
643: PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);
644: }
645: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
646: PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));
647: }
648: } catch(char *ex) {
649: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
650: }
651: }
652: return(0);
653: }
655: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
656: {
657: PetscErrorCode ierr;
658: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
659: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
660: IS isrow = a->row,iscol = a->icol;
661: PetscBool row_identity,col_identity;
662: PetscInt n = A->rmap->n;
665: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
666: MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
667: MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);
669: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
670: cusparseTriFactors->nnz=a->nz;
672: A->offloadmask = PETSC_OFFLOAD_BOTH;
673: /* lower triangular indices */
674: ISIdentity(isrow,&row_identity);
675: if (!row_identity && !cusparseTriFactors->rpermIndices) {
676: const PetscInt *r;
678: ISGetIndices(isrow,&r);
679: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
680: cusparseTriFactors->rpermIndices->assign(r, r+n);
681: ISRestoreIndices(isrow,&r);
682: PetscLogCpuToGpu(n*sizeof(PetscInt));
683: }
685: /* upper triangular indices */
686: ISIdentity(iscol,&col_identity);
687: if (!col_identity && !cusparseTriFactors->cpermIndices) {
688: const PetscInt *c;
690: ISGetIndices(iscol,&c);
691: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
692: cusparseTriFactors->cpermIndices->assign(c, c+n);
693: ISRestoreIndices(iscol,&c);
694: PetscLogCpuToGpu(n*sizeof(PetscInt));
695: }
696: return(0);
697: }
699: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
700: {
701: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
702: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
703: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
704: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
705: cusparseStatus_t stat;
706: PetscErrorCode ierr;
707: cudaError_t cerr;
708: PetscInt *AiUp, *AjUp;
709: PetscScalar *AAUp;
710: PetscScalar *AALo;
711: PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
712: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
713: const PetscInt *ai = b->i,*aj = b->j,*vj;
714: const MatScalar *aa = b->a,*v;
717: if (!n) return(0);
718: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
719: try {
720: cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
721: cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
722: if (!upTriFactor && !loTriFactor) {
723: /* Allocate Space for the upper triangular matrix */
724: cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
725: cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
727: /* Fill the upper triangular matrix */
728: AiUp[0]=(PetscInt) 0;
729: AiUp[n]=nzUpper;
730: offset = 0;
731: for (i=0; i<n; i++) {
732: /* set the pointers */
733: v = aa + ai[i];
734: vj = aj + ai[i];
735: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
737: /* first, set the diagonal elements */
738: AjUp[offset] = (PetscInt) i;
739: AAUp[offset] = (MatScalar)1.0/v[nz];
740: AiUp[i] = offset;
741: AALo[offset] = (MatScalar)1.0/v[nz];
743: offset+=1;
744: if (nz>0) {
745: PetscArraycpy(&(AjUp[offset]), vj, nz);
746: PetscArraycpy(&(AAUp[offset]), v, nz);
747: for (j=offset; j<offset+nz; j++) {
748: AAUp[j] = -AAUp[j];
749: AALo[j] = AAUp[j]/v[nz];
750: }
751: offset+=nz;
752: }
753: }
755: /* allocate space for the triangular factor information */
756: PetscNew(&upTriFactor);
757: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
759: /* Create the matrix description */
760: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
761: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
762: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
763: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
764: #else
765: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
766: #endif
767: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
768: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
770: /* set the matrix */
771: upTriFactor->csrMat = new CsrMatrix;
772: upTriFactor->csrMat->num_rows = A->rmap->n;
773: upTriFactor->csrMat->num_cols = A->cmap->n;
774: upTriFactor->csrMat->num_entries = a->nz;
776: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
777: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
779: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
780: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
782: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
783: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
785: /* set the operation */
786: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
788: /* Create the solve analysis information */
789: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
790: stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
791: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
792: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
793: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
794: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
795: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
796: &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
797: cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
798: #endif
800: /* perform the solve analysis */
801: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
802: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
803: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
804: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
805: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
806: ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
807: #endif
808: );CHKERRCUSPARSE(stat);
809: cerr = WaitForCUDA();CHKERRCUDA(cerr);
810: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
812: /* assign the pointer */
813: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
815: /* allocate space for the triangular factor information */
816: PetscNew(&loTriFactor);
817: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
819: /* Create the matrix description */
820: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
821: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
822: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
823: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
824: #else
825: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
826: #endif
827: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
828: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
830: /* set the operation */
831: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
833: /* set the matrix */
834: loTriFactor->csrMat = new CsrMatrix;
835: loTriFactor->csrMat->num_rows = A->rmap->n;
836: loTriFactor->csrMat->num_cols = A->cmap->n;
837: loTriFactor->csrMat->num_entries = a->nz;
839: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
840: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
842: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
843: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
845: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
846: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
848: /* Create the solve analysis information */
849: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
850: stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
851: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
852: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
853: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
854: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
855: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
856: &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
857: cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
858: #endif
860: /* perform the solve analysis */
861: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
862: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
863: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
864: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
865: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
866: ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
867: #endif
868: );CHKERRCUSPARSE(stat);
869: cerr = WaitForCUDA();CHKERRCUDA(cerr);
870: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
872: /* assign the pointer */
873: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
875: PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));
876: cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
877: cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
878: } else {
879: /* Fill the upper triangular matrix */
880: offset = 0;
881: for (i=0; i<n; i++) {
882: /* set the pointers */
883: v = aa + ai[i];
884: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
886: /* first, set the diagonal elements */
887: AAUp[offset] = 1.0/v[nz];
888: AALo[offset] = 1.0/v[nz];
890: offset+=1;
891: if (nz>0) {
892: PetscArraycpy(&(AAUp[offset]), v, nz);
893: for (j=offset; j<offset+nz; j++) {
894: AAUp[j] = -AAUp[j];
895: AALo[j] = AAUp[j]/v[nz];
896: }
897: offset+=nz;
898: }
899: }
900: if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
901: if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
902: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
903: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
904: PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));
905: }
906: cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
907: cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
908: } catch(char *ex) {
909: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
910: }
911: }
912: return(0);
913: }
915: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
916: {
917: PetscErrorCode ierr;
918: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
919: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
920: IS ip = a->row;
921: PetscBool perm_identity;
922: PetscInt n = A->rmap->n;
925: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
926: MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
927: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
928: cusparseTriFactors->nnz=(a->nz-n)*2 + n;
930: A->offloadmask = PETSC_OFFLOAD_BOTH;
932: /* lower triangular indices */
933: ISIdentity(ip,&perm_identity);
934: if (!perm_identity) {
935: IS iip;
936: const PetscInt *irip,*rip;
938: ISInvertPermutation(ip,PETSC_DECIDE,&iip);
939: ISGetIndices(iip,&irip);
940: ISGetIndices(ip,&rip);
941: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
942: cusparseTriFactors->rpermIndices->assign(rip, rip+n);
943: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
944: cusparseTriFactors->cpermIndices->assign(irip, irip+n);
945: ISRestoreIndices(iip,&irip);
946: ISDestroy(&iip);
947: ISRestoreIndices(ip,&rip);
948: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
949: }
950: return(0);
951: }
953: #define CHECK_LAUNCH_ERROR() \
954: do { \
955: /* Check synchronous errors, i.e. pre-launch */ \
956: cudaError_t err = cudaGetLastError(); \
957: if (cudaSuccess != err) { \
958: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
959: } \
960: /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
961: err = cudaDeviceSynchronize(); \
962: if (cudaSuccess != err) { \
963: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
964: } \
965: } while (0)
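/*
   Intended use of CHECK_LAUNCH_ERROR() (a sketch; myKernel is a placeholder name): invoke it
   immediately after a kernel launch so that both synchronous (pre-launch) and asynchronous
   (kernel execution) failures are converted into PETSc errors:

     myKernel<<<grid,block>>>(args);
     CHECK_LAUNCH_ERROR();
*/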
967: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
968: {
969: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
970: IS ip = b->row;
971: PetscBool perm_identity;
975: MatSeqAIJCUSPARSECopyFromGPU(A);
976: MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
977: B->offloadmask = PETSC_OFFLOAD_CPU;
978: /* determine which version of MatSolve needs to be used. */
979: ISIdentity(ip,&perm_identity);
980: if (perm_identity) {
981: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
982: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
983: B->ops->matsolve = NULL;
984: B->ops->matsolvetranspose = NULL;
985: } else {
986: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
987: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
988: B->ops->matsolve = NULL;
989: B->ops->matsolvetranspose = NULL;
990: }
992: /* get the triangular factors */
993: MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
994: return(0);
995: }
997: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
998: {
999: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1000: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1001: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1002: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1003: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1004: cusparseStatus_t stat;
1005: cusparseIndexBase_t indexBase;
1006: cusparseMatrixType_t matrixType;
1007: cusparseFillMode_t fillMode;
1008: cusparseDiagType_t diagType;
1009: cudaError_t cerr;
1010: PetscErrorCode ierr;
1013: /* allocate space for the transpose of the lower triangular factor */
1014: PetscNew(&loTriFactorT);
1015: loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1017: /* set the matrix descriptors of the lower triangular factor */
1018: matrixType = cusparseGetMatType(loTriFactor->descr);
1019: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1020: fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1021: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1022: diagType = cusparseGetMatDiagType(loTriFactor->descr);
1024: /* Create the matrix description */
1025: stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1026: stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1027: stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1028: stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1029: stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1031: /* set the operation */
1032: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1034: /* allocate GPU space for the CSC of the lower triangular factor*/
1035: loTriFactorT->csrMat = new CsrMatrix;
1036: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
1037: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
1038: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
1039: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1040: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1041: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1043: /* compute the transpose of the lower triangular factor, i.e. the CSC */
1044: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1045: stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1046: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1047: loTriFactor->csrMat->values->data().get(),
1048: loTriFactor->csrMat->row_offsets->data().get(),
1049: loTriFactor->csrMat->column_indices->data().get(),
1050: loTriFactorT->csrMat->values->data().get(),
1051: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1052: CUSPARSE_ACTION_NUMERIC,indexBase,
1053: CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1054: cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1055: #endif
1057: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1058: stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1059: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1060: loTriFactor->csrMat->values->data().get(),
1061: loTriFactor->csrMat->row_offsets->data().get(),
1062: loTriFactor->csrMat->column_indices->data().get(),
1063: loTriFactorT->csrMat->values->data().get(),
1064: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1065: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1066: CUSPARSE_ACTION_NUMERIC, indexBase,
1067: CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
1068: #else
1069: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1070: CUSPARSE_ACTION_NUMERIC, indexBase
1071: #endif
1072: );CHKERRCUSPARSE(stat);
1073: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1074: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1076: /* Create the solve analysis information */
1077: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1078: stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1079: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1080: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1081: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1082: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1083: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1084: &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1085: cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1086: #endif
1088: /* perform the solve analysis */
1089: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1090: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1091: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1092: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
1093: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1094: ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1095: #endif
1096: );CHKERRCUSPARSE(stat);
1097: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1098: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1100: /* assign the pointer */
1101: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1103: /*********************************************/
1104: /* Now the Transpose of the Upper Tri Factor */
1105: /*********************************************/
1107: /* allocate space for the transpose of the upper triangular factor */
1108: PetscNew(&upTriFactorT);
1109: upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1111: /* set the matrix descriptors of the upper triangular factor */
1112: matrixType = cusparseGetMatType(upTriFactor->descr);
1113: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1114: fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1115: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1116: diagType = cusparseGetMatDiagType(upTriFactor->descr);
1118: /* Create the matrix description */
1119: stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1120: stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1121: stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1122: stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1123: stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1125: /* set the operation */
1126: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1128: /* allocate GPU space for the CSC of the upper triangular factor*/
1129: upTriFactorT->csrMat = new CsrMatrix;
1130: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
1131: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
1132: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
1133: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1134: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1135: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1137: /* compute the transpose of the upper triangular factor, i.e. the CSC */
1138: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1139: stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1140: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1141: upTriFactor->csrMat->values->data().get(),
1142: upTriFactor->csrMat->row_offsets->data().get(),
1143: upTriFactor->csrMat->column_indices->data().get(),
1144: upTriFactorT->csrMat->values->data().get(),
1145: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1146: CUSPARSE_ACTION_NUMERIC,indexBase,
1147: CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1148: cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1149: #endif
1151: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1152: stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1153: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1154: upTriFactor->csrMat->values->data().get(),
1155: upTriFactor->csrMat->row_offsets->data().get(),
1156: upTriFactor->csrMat->column_indices->data().get(),
1157: upTriFactorT->csrMat->values->data().get(),
1158: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1159: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1160: CUSPARSE_ACTION_NUMERIC, indexBase,
1161: CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1162: #else
1163: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1164: CUSPARSE_ACTION_NUMERIC, indexBase
1165: #endif
1166: );CHKERRCUSPARSE(stat);
1167: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1168: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1170: /* Create the solve analysis information */
1171: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1172: stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1173: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1174: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1175: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1176: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1177: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1178: &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1179: cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1180: #endif
1182: /* perform the solve analysis */
1183: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1184: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1185: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1186: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1187: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1188: ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1189: #endif
1190: );CHKERRCUSPARSE(stat);
1191: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1192: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1194: /* assign the pointer */
1195: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1196: return(0);
1197: }
1199: struct PetscScalarToPetscInt
1200: {
1201: __host__ __device__
1202: PetscInt operator()(PetscScalar s)
1203: {
1204: return (PetscInt)PetscRealPart(s);
1205: }
1206: };
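/*
   PetscScalarToPetscInt is a unary functor for thrust algorithms; a sketch of its use
   (the array names here are illustrative only):

     // dvals is a THRUSTARRAY of PetscScalar holding integer-valued entries,
     // dints is a THRUSTINTARRAY of the same length
     thrust::transform(dvals.begin(),dvals.end(),dints.begin(),PetscScalarToPetscInt());
*/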
1208: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
1209: {
1210: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1211: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1212: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1213: cusparseStatus_t stat;
1214: cusparseIndexBase_t indexBase;
1215: cudaError_t err;
1216: PetscErrorCode ierr;
1219: if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) return(0);
1220: MatSeqAIJCUSPARSECopyToGPU(A);
1221: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1222: if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
1223: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1224: if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
1225: if (A->transupdated) return(0);
1226: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1227: if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1228: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1229: }
1230: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1231: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1232: stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1233: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1234: stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1235: stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1237: /* set alpha and beta */
1238: err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1239: err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1240: err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1241: err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1242: err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1243: err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1245: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1246: CsrMatrix *matrixT = new CsrMatrix;
1247: matstructT->mat = matrixT;
1248: matrixT->num_rows = A->cmap->n;
1249: matrixT->num_cols = A->rmap->n;
1250: matrixT->num_entries = a->nz;
1251: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1252: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1253: matrixT->values = new THRUSTARRAY(a->nz);
1255: if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1256: cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1258: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259: stat = cusparseCreateCsr(&matstructT->matDescr,
1260: matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1261: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1262: matrixT->values->data().get(),
1263: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1264: indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1265: #endif
1266: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1267: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1268: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1269: #else
1270: CsrMatrix *temp = new CsrMatrix;
1271: CsrMatrix *tempT = new CsrMatrix;
1272: /* First convert HYB to CSR */
1273: temp->num_rows = A->rmap->n;
1274: temp->num_cols = A->cmap->n;
1275: temp->num_entries = a->nz;
1276: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1277: temp->column_indices = new THRUSTINTARRAY32(a->nz);
1278: temp->values = new THRUSTARRAY(a->nz);
1280: stat = cusparse_hyb2csr(cusparsestruct->handle,
1281: matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1282: temp->values->data().get(),
1283: temp->row_offsets->data().get(),
1284: temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1286: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1287: tempT->num_rows = A->rmap->n;
1288: tempT->num_cols = A->cmap->n;
1289: tempT->num_entries = a->nz;
1290: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1291: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1292: tempT->values = new THRUSTARRAY(a->nz);
1294: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1295: temp->num_cols, temp->num_entries,
1296: temp->values->data().get(),
1297: temp->row_offsets->data().get(),
1298: temp->column_indices->data().get(),
1299: tempT->values->data().get(),
1300: tempT->column_indices->data().get(),
1301: tempT->row_offsets->data().get(),
1302: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1304: /* Last, convert CSC to HYB */
1305: cusparseHybMat_t hybMat;
1306: stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1307: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1308: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1309: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1310: matstructT->descr, tempT->values->data().get(),
1311: tempT->row_offsets->data().get(),
1312: tempT->column_indices->data().get(),
1313: hybMat, 0, partition);CHKERRCUSPARSE(stat);
1315: /* assign the pointer */
1316: matstructT->mat = hybMat;
1317: A->transupdated = PETSC_TRUE;
1318: /* delete temporaries */
1319: if (tempT) {
1320: if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1321: if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1322: if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1323: delete (CsrMatrix*) tempT;
1324: }
1325: if (temp) {
1326: if (temp->values) delete (THRUSTARRAY*) temp->values;
1327: if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1328: if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1329: delete (CsrMatrix*) temp;
1330: }
1331: #endif
1332: }
1333: }
1334: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1335: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1336: CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1337: if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
1338: if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
1339: if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
1340: if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
1341: if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
1342: if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
1343: if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
1344: if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
1345: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1346: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1347: cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1348: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
1349: }
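/* Compute the CSR -> CSC entry permutation once: transpose an array holding the sequence
   0,1,2,..., so that the transposed values give, for each CSC entry, the index of the
   corresponding CSR entry. Later transpose updates then reduce to a permuted copy of the values. */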
1350: if (!cusparsestruct->csr2csc_i) {
1351: THRUSTARRAY csr2csc_a(matrix->num_entries);
1352: PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1354: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1355: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1356: void *csr2cscBuffer;
1357: size_t csr2cscBufferSize;
1358: stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1359: A->cmap->n, matrix->num_entries,
1360: matrix->values->data().get(),
1361: cusparsestruct->rowoffsets_gpu->data().get(),
1362: matrix->column_indices->data().get(),
1363: matrixT->values->data().get(),
1364: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1365: CUSPARSE_ACTION_NUMERIC,indexBase,
1366: cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1367: err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1368: #endif
1370: if (matrix->num_entries) {
1371: /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1372: mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1373: I checked every parameter and they were all fine. I have no clue why cusparse complains.
1375: Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1376: should be filled with indexBase. So I just take a shortcut here.
1377: */
1378: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1379: A->cmap->n,matrix->num_entries,
1380: csr2csc_a.data().get(),
1381: cusparsestruct->rowoffsets_gpu->data().get(),
1382: matrix->column_indices->data().get(),
1383: matrixT->values->data().get(),
1384: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1385: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1386: CUSPARSE_ACTION_NUMERIC,indexBase,
1387: cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1388: #else
1389: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1390: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1391: #endif
1392: } else {
1393: matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1394: }
1396: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1397: PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1398: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1399: err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1400: #endif
1401: }
1402: PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1403: thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1404: matrixT->values->begin()));
1405: }
1406: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1407: /* the compressed row indices are not used for matTranspose */
1408: matstructT->cprowIndices = NULL;
1409: /* assign the pointer */
1410: ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1411: A->transupdated = PETSC_TRUE;
1412: return(0);
1413: }
1415: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
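/* Solve A^T x = b with the factors stored on the GPU: permute b with the row permutation, solve
   with the transposed upper factor, then with the transposed lower factor, and finally apply the
   column permutation to obtain x. */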
1416: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1417: {
1418: PetscInt n = xx->map->n;
1419: const PetscScalar *barray;
1420: PetscScalar *xarray;
1421: thrust::device_ptr<const PetscScalar> bGPU;
1422: thrust::device_ptr<PetscScalar> xGPU;
1423: cusparseStatus_t stat;
1424: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1425: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1426: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1427: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1428: PetscErrorCode ierr;
1429: cudaError_t cerr;
1432: /* Analyze the matrix and create the transpose ... on the fly */
1433: if (!loTriFactorT && !upTriFactorT) {
1434: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1435: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1436: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1437: }
1439: /* Get the GPU pointers */
1440: VecCUDAGetArrayWrite(xx,&xarray);
1441: VecCUDAGetArrayRead(bb,&barray);
1442: xGPU = thrust::device_pointer_cast(xarray);
1443: bGPU = thrust::device_pointer_cast(barray);
1445: PetscLogGpuTimeBegin();
1446: /* First, reorder with the row permutation */
1447: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1448: thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1449: xGPU);
1451: /* Next, solve U */
1452: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1453: upTriFactorT->csrMat->num_rows,
1454: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1455: upTriFactorT->csrMat->num_entries,
1456: #endif
1457: &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1458: upTriFactorT->csrMat->values->data().get(),
1459: upTriFactorT->csrMat->row_offsets->data().get(),
1460: upTriFactorT->csrMat->column_indices->data().get(),
1461: upTriFactorT->solveInfo,
1462: xarray, tempGPU->data().get()
1463: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1464: ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1465: #endif
1466: );CHKERRCUSPARSE(stat);
1468: /* Then, solve L */
1469: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1470: loTriFactorT->csrMat->num_rows,
1471: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1472: loTriFactorT->csrMat->num_entries,
1473: #endif
1474: &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1475: loTriFactorT->csrMat->values->data().get(),
1476: loTriFactorT->csrMat->row_offsets->data().get(),
1477: loTriFactorT->csrMat->column_indices->data().get(),
1478: loTriFactorT->solveInfo,
1479: tempGPU->data().get(), xarray
1480: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1481: ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1482: #endif
1483: );CHKERRCUSPARSE(stat);
1485: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1486: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1487: thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1488: tempGPU->begin());
1490: /* Copy the temporary to the full solution. */
1491: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1493: /* restore */
1494: VecCUDARestoreArrayRead(bb,&barray);
1495: VecCUDARestoreArrayWrite(xx,&xarray);
1496: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1497: PetscLogGpuTimeEnd();
1498: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1499: return(0);
1500: }
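/* Same as MatSolveTranspose_SeqAIJCUSPARSE, but for a factorization in the natural ordering, so
   no row/column permutations are applied. */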
1502: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1503: {
1504: const PetscScalar *barray;
1505: PetscScalar *xarray;
1506: cusparseStatus_t stat;
1507: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1508: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1509: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1510: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1511: PetscErrorCode ierr;
1512: cudaError_t cerr;
1515: /* Analyze the matrix and create the transpose ... on the fly */
1516: if (!loTriFactorT && !upTriFactorT) {
1517: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1518: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1519: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1520: }
1522: /* Get the GPU pointers */
1523: VecCUDAGetArrayWrite(xx,&xarray);
1524: VecCUDAGetArrayRead(bb,&barray);
1526: PetscLogGpuTimeBegin();
1527: /* First, solve U */
1528: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1529: upTriFactorT->csrMat->num_rows,
1530: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1531: upTriFactorT->csrMat->num_entries,
1532: #endif
1533: &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1534: upTriFactorT->csrMat->values->data().get(),
1535: upTriFactorT->csrMat->row_offsets->data().get(),
1536: upTriFactorT->csrMat->column_indices->data().get(),
1537: upTriFactorT->solveInfo,
1538: barray, tempGPU->data().get()
1539: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1540: ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1541: #endif
1542: );CHKERRCUSPARSE(stat);
1544: /* Then, solve L */
1545: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1546: loTriFactorT->csrMat->num_rows,
1547: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1548: loTriFactorT->csrMat->num_entries,
1549: #endif
1550: &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1551: loTriFactorT->csrMat->values->data().get(),
1552: loTriFactorT->csrMat->row_offsets->data().get(),
1553: loTriFactorT->csrMat->column_indices->data().get(),
1554: loTriFactorT->solveInfo,
1555: tempGPU->data().get(), xarray
1556: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1557: ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1558: #endif
1559: );CHKERRCUSPARSE(stat);
1561: /* restore */
1562: VecCUDARestoreArrayRead(bb,&barray);
1563: VecCUDARestoreArrayWrite(xx,&xarray);
1564: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1565: PetscLogGpuTimeEnd();
1566: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1567: return(0);
1568: }
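/* Solve A x = b with the factors stored on the GPU: permute b with the row permutation, solve
   with the lower factor, then with the upper factor, and apply the column permutation. */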
1570: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1571: {
1572: const PetscScalar *barray;
1573: PetscScalar *xarray;
1574: thrust::device_ptr<const PetscScalar> bGPU;
1575: thrust::device_ptr<PetscScalar> xGPU;
1576: cusparseStatus_t stat;
1577: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1578: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1579: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1580: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1581: PetscErrorCode ierr;
1582: cudaError_t cerr;
1586: /* Get the GPU pointers */
1587: VecCUDAGetArrayWrite(xx,&xarray);
1588: VecCUDAGetArrayRead(bb,&barray);
1589: xGPU = thrust::device_pointer_cast(xarray);
1590: bGPU = thrust::device_pointer_cast(barray);
1592: PetscLogGpuTimeBegin();
1593: /* First, reorder with the row permutation */
1594: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1595: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1596: tempGPU->begin());
1598: /* Next, solve L */
1599: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1600: loTriFactor->csrMat->num_rows,
1601: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1602: loTriFactor->csrMat->num_entries,
1603: #endif
1604: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1605: loTriFactor->csrMat->values->data().get(),
1606: loTriFactor->csrMat->row_offsets->data().get(),
1607: loTriFactor->csrMat->column_indices->data().get(),
1608: loTriFactor->solveInfo,
1609: tempGPU->data().get(), xarray
1610: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1611: ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1612: #endif
1613: );CHKERRCUSPARSE(stat);
1615: /* Then, solve U */
1616: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1617: upTriFactor->csrMat->num_rows,
1618: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1619: upTriFactor->csrMat->num_entries,
1620: #endif
1621: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1622: upTriFactor->csrMat->values->data().get(),
1623: upTriFactor->csrMat->row_offsets->data().get(),
1624: upTriFactor->csrMat->column_indices->data().get(),
1625: upTriFactor->solveInfo,
1626: xarray, tempGPU->data().get()
1627: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1628: ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1629: #endif
1630: );CHKERRCUSPARSE(stat);
1632: /* Last, reorder with the column permutation */
1633: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1634: thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1635: xGPU);
1637: VecCUDARestoreArrayRead(bb,&barray);
1638: VecCUDARestoreArrayWrite(xx,&xarray);
1639: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1640: PetscLogGpuTimeEnd();
1641: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1642: return(0);
1643: }
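/* Same as MatSolve_SeqAIJCUSPARSE, but for a factorization in the natural ordering, so no
   row/column permutations are applied. */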
1645: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1646: {
1647: const PetscScalar *barray;
1648: PetscScalar *xarray;
1649: cusparseStatus_t stat;
1650: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1651: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1652: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1653: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1654: PetscErrorCode ierr;
1655: cudaError_t cerr;
1658: /* Get the GPU pointers */
1659: VecCUDAGetArrayWrite(xx,&xarray);
1660: VecCUDAGetArrayRead(bb,&barray);
1662: PetscLogGpuTimeBegin();
1663: /* First, solve L */
1664: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1665: loTriFactor->csrMat->num_rows,
1666: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1667: loTriFactor->csrMat->num_entries,
1668: #endif
1669: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1670: loTriFactor->csrMat->values->data().get(),
1671: loTriFactor->csrMat->row_offsets->data().get(),
1672: loTriFactor->csrMat->column_indices->data().get(),
1673: loTriFactor->solveInfo,
1674: barray, tempGPU->data().get()
1675: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1676: ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1677: #endif
1678: );CHKERRCUSPARSE(stat);
1680: /* Next, solve U */
1681: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1682: upTriFactor->csrMat->num_rows,
1683: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1684: upTriFactor->csrMat->num_entries,
1685: #endif
1686: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1687: upTriFactor->csrMat->values->data().get(),
1688: upTriFactor->csrMat->row_offsets->data().get(),
1689: upTriFactor->csrMat->column_indices->data().get(),
1690: upTriFactor->solveInfo,
1691: tempGPU->data().get(), xarray
1692: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1693: ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1694: #endif
1695: );CHKERRCUSPARSE(stat);
1697: VecCUDARestoreArrayRead(bb,&barray);
1698: VecCUDARestoreArrayWrite(xx,&xarray);
1699: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1700: PetscLogGpuTimeEnd();
1701: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1702: return(0);
1703: }
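/* Copy the matrix values from the GPU back to the host when the up-to-date data lives only on
   the GPU; the nonzero pattern on the host is left untouched. */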
1705: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1706: {
1707: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1708: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1709: cudaError_t cerr;
1710: PetscErrorCode ierr;
1713: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1714: CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
1716: PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1717: cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1718: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1719: PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));
1720: PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1721: A->offloadmask = PETSC_OFFLOAD_BOTH;
1722: }
1723: return(0);
1724: }
1726: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1727: {
1728: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1732: MatSeqAIJCUSPARSECopyFromGPU(A);
1733: *array = a->a;
1734: A->offloadmask = PETSC_OFFLOAD_CPU;
1735: return(0);
1736: }
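/* Copy the matrix from the host to the GPU. If the nonzero pattern has not changed and the
   storage format is CSR, only the values are copied; otherwise the cusparse structures are
   rebuilt from scratch (CSR, or ELL/HYB for CUDA < 11). */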
1738: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1739: {
1740: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1741: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1742: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1743: PetscInt m = A->rmap->n,*ii,*ridx,tmp;
1744: PetscErrorCode ierr;
1745: cusparseStatus_t stat;
1746: PetscBool both = PETSC_TRUE;
1747: cudaError_t err;
1750: if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1751: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1752: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1753: CsrMatrix *matrix;
1754: matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1756: if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
1757: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1758: matrix->values->assign(a->a, a->a+a->nz);
1759: err = WaitForCUDA();CHKERRCUDA(err);
1760: PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1761: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1762: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
1763: } else {
1764: PetscInt nnz;
1765: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1766: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1767: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1768: delete cusparsestruct->workVector;
1769: delete cusparsestruct->rowoffsets_gpu;
1770: cusparsestruct->workVector = NULL;
1771: cusparsestruct->rowoffsets_gpu = NULL;
1772: try {
1773: if (a->compressedrow.use) {
1774: m = a->compressedrow.nrows;
1775: ii = a->compressedrow.i;
1776: ridx = a->compressedrow.rindex;
1777: } else {
1778: m = A->rmap->n;
1779: ii = a->i;
1780: ridx = NULL;
1781: }
1782: if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1783: if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
1784: if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1785: else nnz = a->nz;
1787: /* create cusparse matrix */
1788: cusparsestruct->nrows = m;
1789: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1790: stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1791: stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1792: stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1794: err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1795: err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1796: err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1797: err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1798: err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1799: err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1800: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1802: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1803: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1804: /* set the matrix */
1805: CsrMatrix *mat= new CsrMatrix;
1806: mat->num_rows = m;
1807: mat->num_cols = A->cmap->n;
1808: mat->num_entries = nnz;
1809: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1810: mat->row_offsets->assign(ii, ii + m+1);
1812: mat->column_indices = new THRUSTINTARRAY32(nnz);
1813: mat->column_indices->assign(a->j, a->j+nnz);
1815: mat->values = new THRUSTARRAY(nnz);
1816: if (a->a) mat->values->assign(a->a, a->a+nnz);
1818: /* assign the pointer */
1819: matstruct->mat = mat;
1820: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1821: if (mat->num_rows) { /* cusparse errors on empty matrices! */
1822: stat = cusparseCreateCsr(&matstruct->matDescr,
1823: mat->num_rows, mat->num_cols, mat->num_entries,
1824: mat->row_offsets->data().get(), mat->column_indices->data().get(),
1825: mat->values->data().get(),
1826: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1827: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1828: }
1829: #endif
1830: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1831: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1832: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1833: #else
1834: CsrMatrix *mat= new CsrMatrix;
1835: mat->num_rows = m;
1836: mat->num_cols = A->cmap->n;
1837: mat->num_entries = nnz;
1838: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1839: mat->row_offsets->assign(ii, ii + m+1);
1841: mat->column_indices = new THRUSTINTARRAY32(nnz);
1842: mat->column_indices->assign(a->j, a->j+nnz);
1844: mat->values = new THRUSTARRAY(nnz);
1845: if (a->a) mat->values->assign(a->a, a->a+nnz);
1847: cusparseHybMat_t hybMat;
1848: stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1849: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1850: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1851: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1852: matstruct->descr, mat->values->data().get(),
1853: mat->row_offsets->data().get(),
1854: mat->column_indices->data().get(),
1855: hybMat, 0, partition);CHKERRCUSPARSE(stat);
1856: /* assign the pointer */
1857: matstruct->mat = hybMat;
1859: if (mat) {
1860: if (mat->values) delete (THRUSTARRAY*)mat->values;
1861: if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1862: if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1863: delete (CsrMatrix*)mat;
1864: }
1865: #endif
1866: }
1868: /* assign the compressed row indices */
1869: if (a->compressedrow.use) {
1870: cusparsestruct->workVector = new THRUSTARRAY(m);
1871: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1872: matstruct->cprowIndices->assign(ridx,ridx+m);
1873: tmp = m;
1874: } else {
1875: cusparsestruct->workVector = NULL;
1876: matstruct->cprowIndices = NULL;
1877: tmp = 0;
1878: }
1879: PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));
1881: /* assign the pointer */
1882: cusparsestruct->mat = matstruct;
1883: } catch(char *ex) {
1884: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1885: }
1886: err = WaitForCUDA();CHKERRCUDA(err);
1887: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1888: cusparsestruct->nonzerostate = A->nonzerostate;
1889: }
1890: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1891: }
1892: return(0);
1893: }
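/* Thrust functors applied elementwise over zipped iterators: y += x, y = x, and the reversed
   assignment x = y, respectively */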
1895: struct VecCUDAPlusEquals
1896: {
1897: template <typename Tuple>
1898: __host__ __device__
1899: void operator()(Tuple t)
1900: {
1901: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1902: }
1903: };
1905: struct VecCUDAEquals
1906: {
1907: template <typename Tuple>
1908: __host__ __device__
1909: void operator()(Tuple t)
1910: {
1911: thrust::get<1>(t) = thrust::get<0>(t);
1912: }
1913: };
1915: struct VecCUDAEqualsReverse
1916: {
1917: template <typename Tuple>
1918: __host__ __device__
1919: void operator()(Tuple t)
1920: {
1921: thrust::get<0>(t) = thrust::get<1>(t);
1922: }
1923: };
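/* Auxiliary data attached to C->product for the sparse-dense and sparse-sparse products below;
   it is freed by MatDestroy_MatMatCusparse() */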
1925: struct MatMatCusparse {
1926: PetscBool cisdense;
1927: PetscScalar *Bt;
1928: Mat X;
1929: PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1930: PetscLogDouble flops;
1931: CsrMatrix *Bcsr;
1932: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1933: cusparseSpMatDescr_t matSpBDescr;
1934: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
1935: cusparseDnMatDescr_t matBDescr;
1936: cusparseDnMatDescr_t matCDescr;
1937: PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
1938: size_t mmBufferSize;
1939: void *mmBuffer;
1940: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1941: cusparseSpGEMMDescr_t spgemmDesc;
1942: #endif
1943: };
1945: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1946: {
1947: PetscErrorCode ierr;
1948: MatMatCusparse *mmdata = (MatMatCusparse *)data;
1949: cudaError_t cerr;
1950: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1951: cusparseStatus_t stat;
1952: #endif
1955: cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1956: delete mmdata->Bcsr;
1957: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1958: if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
1959: if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
1960: if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
1961: if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
1962: if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
1963: if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
1964: #endif
1965: MatDestroy(&mmdata->X);
1966: PetscFree(data);
1967: return(0);
1968: }
1970: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
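/* Numeric phase of C = op(A)*op(B) with A sparse and B dense: uses cusparseSpMM() for CUDA >= 11
   and the cusparse_csr_spmm() wrapper otherwise (in which case B^T is formed explicitly with
   cublasXgeam() for ABt/RARt, since csrmm does not support op(B)). For PtAP/RARt the result of
   op(A)*op(B) is stored in mmdata->X and then multiplied with B on the GPU. */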
1972: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1973: {
1974: Mat_Product *product = C->product;
1975: Mat A,B;
1976: PetscInt m,n,blda,clda;
1977: PetscBool flg,biscuda;
1978: Mat_SeqAIJCUSPARSE *cusp;
1979: cusparseStatus_t stat;
1980: cusparseOperation_t opA;
1981: const PetscScalar *barray;
1982: PetscScalar *carray;
1983: PetscErrorCode ierr;
1984: MatMatCusparse *mmdata;
1985: Mat_SeqAIJCUSPARSEMultStruct *mat;
1986: CsrMatrix *csrmat;
1987: cudaError_t cerr;
1990: MatCheckProduct(C,1);
1991: if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1992: mmdata = (MatMatCusparse*)product->data;
1993: A = product->A;
1994: B = product->B;
1995: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1996: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1997: /* currently CopyToGpu does not copy if the matrix is bound to CPU
1998: Instead of silently accepting the wrong answer, I prefer to raise an error */
1999: if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2000: MatSeqAIJCUSPARSECopyToGPU(A);
2001: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2002: switch (product->type) {
2003: case MATPRODUCT_AB:
2004: case MATPRODUCT_PtAP:
2005: mat = cusp->mat;
2006: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2007: m = A->rmap->n;
2008: n = B->cmap->n;
2009: break;
2010: case MATPRODUCT_AtB:
2011: if (!A->form_explicit_transpose) {
2012: mat = cusp->mat;
2013: opA = CUSPARSE_OPERATION_TRANSPOSE;
2014: } else {
2015: MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
2016: mat = cusp->matTranspose;
2017: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2018: }
2019: m = A->cmap->n;
2020: n = B->cmap->n;
2021: break;
2022: case MATPRODUCT_ABt:
2023: case MATPRODUCT_RARt:
2024: mat = cusp->mat;
2025: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2026: m = A->rmap->n;
2027: n = B->rmap->n;
2028: break;
2029: default:
2030: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2031: }
2032: if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2033: csrmat = (CsrMatrix*)mat->mat;
2034: /* if the user passed a CPU matrix, copy the data to the GPU */
2035: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
2036: if (!biscuda) {MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);}
2037: MatDenseCUDAGetArrayRead(B,&barray);
2039: MatDenseGetLDA(B,&blda);
2040: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2041: MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
2042: MatDenseGetLDA(mmdata->X,&clda);
2043: } else {
2044: MatDenseCUDAGetArrayWrite(C,&carray);
2045: MatDenseGetLDA(C,&clda);
2046: }
2048: PetscLogGpuTimeBegin();
2049: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2050: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2051: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2052: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2053: size_t mmBufferSize;
2054: if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2055: if (!mmdata->matBDescr) {
2056: stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2057: mmdata->Blda = blda;
2058: }
2060: if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2061: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2062: stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2063: mmdata->Clda = clda;
2064: }
2066: if (!mat->matDescr) {
2067: stat = cusparseCreateCsr(&mat->matDescr,
2068: csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2069: csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2070: csrmat->values->data().get(),
2071: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2072: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2073: }
2074: stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2075: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2076: mmdata->matCDescr,cusparse_scalartype,
2077: cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2078: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2079: cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2080: cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2081: mmdata->mmBufferSize = mmBufferSize;
2082: }
2083: mmdata->initialized = PETSC_TRUE;
2084: } else {
2085: /* to be safe, always update pointers of the mats */
2086: stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2087: stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2088: stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2089: }
2091: /* do cusparseSpMM, which supports transpose on B */
2092: stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2093: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2094: mmdata->matCDescr,cusparse_scalartype,
2095: cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2096: #else
2097: PetscInt k;
2098: /* cusparseXcsrmm does not support transpose on B */
2099: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2100: cublasHandle_t cublasv2handle;
2101: cublasStatus_t cerr;
2103: PetscCUBLASGetHandle(&cublasv2handle);
2104: cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2105: B->cmap->n,B->rmap->n,
2106: &PETSC_CUSPARSE_ONE ,barray,blda,
2107: &PETSC_CUSPARSE_ZERO,barray,blda,
2108: mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2109: blda = B->cmap->n;
2110: k = B->cmap->n;
2111: } else {
2112: k = B->rmap->n;
2113: }
2115: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2116: stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2117: csrmat->num_entries,mat->alpha_one,mat->descr,
2118: csrmat->values->data().get(),
2119: csrmat->row_offsets->data().get(),
2120: csrmat->column_indices->data().get(),
2121: mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2122: carray,clda);CHKERRCUSPARSE(stat);
2123: #endif
2124: cerr = WaitForCUDA();CHKERRCUDA(cerr);
2125: PetscLogGpuTimeEnd();
2126: PetscLogGpuFlops(n*2.0*csrmat->num_entries);
2127: MatDenseCUDARestoreArrayRead(B,&barray);
2128: if (product->type == MATPRODUCT_RARt) {
2129: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2130: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
2131: } else if (product->type == MATPRODUCT_PtAP) {
2132: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2133: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
2134: } else {
2135: MatDenseCUDARestoreArrayWrite(C,&carray);
2136: }
2137: if (mmdata->cisdense) {
2138: MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
2139: }
2140: if (!biscuda) {
2141: MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
2142: }
2143: return(0);
2144: }
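/* Symbolic phase for the sparse-dense products: set the sizes and type of C (always computing on
   the GPU, converting back if the user asked for MATSEQDENSE) and allocate the MatMatCusparse
   data, i.e. a B^T buffer for ABt/RARt with CUDA < 11 and the intermediate dense matrix X for
   PtAP/RARt. */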
2146: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2147: {
2148: Mat_Product *product = C->product;
2149: Mat A,B;
2150: PetscInt m,n;
2151: PetscBool cisdense,flg;
2152: PetscErrorCode ierr;
2153: MatMatCusparse *mmdata;
2154: Mat_SeqAIJCUSPARSE *cusp;
2157: MatCheckProduct(C,1);
2158: if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2159: A = product->A;
2160: B = product->B;
2161: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2162: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2163: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2164: if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2165: switch (product->type) {
2166: case MATPRODUCT_AB:
2167: m = A->rmap->n;
2168: n = B->cmap->n;
2169: break;
2170: case MATPRODUCT_AtB:
2171: m = A->cmap->n;
2172: n = B->cmap->n;
2173: break;
2174: case MATPRODUCT_ABt:
2175: m = A->rmap->n;
2176: n = B->rmap->n;
2177: break;
2178: case MATPRODUCT_PtAP:
2179: m = B->cmap->n;
2180: n = B->cmap->n;
2181: break;
2182: case MATPRODUCT_RARt:
2183: m = B->rmap->n;
2184: n = B->rmap->n;
2185: break;
2186: default:
2187: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2188: }
2189: MatSetSizes(C,m,n,m,n);
2190: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2191: PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
2192: MatSetType(C,MATSEQDENSECUDA);
2194: /* product data */
2195: PetscNew(&mmdata);
2196: mmdata->cisdense = cisdense;
2197: #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2198: /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2199: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2200: cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2201: }
2202: #endif
2203: /* for these products we need intermediate storage */
2204: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2205: MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
2206: MatSetType(mmdata->X,MATSEQDENSECUDA);
2207: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2208: MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
2209: } else {
2210: MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
2211: }
2212: }
2213: C->product->data = mmdata;
2214: C->product->destroy = MatDestroy_MatMatCusparse;
2216: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2217: return(0);
2218: }
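/* Numeric phase of the sparse-sparse product C = A*B (A^T*B and A*B^T are handled through the
   explicit transposes): calls cusparseSpGEMM_compute/copy for CUDA >= 11 and the
   cusparse_csr_spgemm() wrapper otherwise. The CSR structure of C was allocated in the symbolic
   phase below. */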
2220: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2221: {
2222: Mat_Product *product = C->product;
2223: Mat A,B;
2224: Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp;
2225: Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
2226: Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2227: CsrMatrix *Acsr,*Bcsr,*Ccsr;
2228: PetscBool flg;
2229: PetscErrorCode ierr;
2230: cusparseStatus_t stat;
2231: cudaError_t cerr;
2232: MatProductType ptype;
2233: MatMatCusparse *mmdata;
2234: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2235: cusparseSpMatDescr_t BmatSpDescr;
2236: #endif
2239: MatCheckProduct(C,1);
2240: if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
2241: PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);
2242: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
2243: mmdata = (MatMatCusparse*)C->product->data;
2244: A = product->A;
2245: B = product->B;
2246: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2247: mmdata->reusesym = PETSC_FALSE;
2248: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2249: if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2250: Cmat = Ccusp->mat;
2251: if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2252: Ccsr = (CsrMatrix*)Cmat->mat;
2253: if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2254: goto finalize;
2255: }
2256: if (!c->nz) goto finalize;
2257: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2258: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2259: PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2260: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2261: if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2262: if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2263: Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2264: Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2265: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2266: if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2267: if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2268: if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2269: MatSeqAIJCUSPARSECopyToGPU(A);
2270: MatSeqAIJCUSPARSECopyToGPU(B);
2272: ptype = product->type;
2273: if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2274: if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2275: switch (ptype) {
2276: case MATPRODUCT_AB:
2277: Amat = Acusp->mat;
2278: Bmat = Bcusp->mat;
2279: break;
2280: case MATPRODUCT_AtB:
2281: Amat = Acusp->matTranspose;
2282: Bmat = Bcusp->mat;
2283: break;
2284: case MATPRODUCT_ABt:
2285: Amat = Acusp->mat;
2286: Bmat = Bcusp->matTranspose;
2287: break;
2288: default:
2289: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2290: }
2291: Cmat = Ccusp->mat;
2292: if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2293: if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2294: if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2295: Acsr = (CsrMatrix*)Amat->mat;
2296: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2297: Ccsr = (CsrMatrix*)Cmat->mat;
2298: if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2299: if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2300: if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2301: PetscLogGpuTimeBegin();
2302: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2303: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2304: stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2305: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2306: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2307: mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2308: stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2309: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2310: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2311: #else
2312: stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2313: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2314: Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2315: Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2316: Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2317: #endif
2318: PetscLogGpuFlops(mmdata->flops);
2319: cerr = WaitForCUDA();CHKERRCUDA(cerr);
2320: PetscLogGpuTimeEnd();
2321: C->offloadmask = PETSC_OFFLOAD_GPU;
2322: finalize:
2323: /* shorter version of MatAssemblyEnd_SeqAIJ */
2324: PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);
2325: PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");
2326: PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);
2327: c->reallocs = 0;
2328: C->info.mallocs += 0;
2329: C->info.nz_unneeded = 0;
2330: C->assembled = C->was_assembled = PETSC_TRUE;
2331: C->num_ass++;
2332: return(0);
2333: }
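/* Symbolic phase of the sparse-sparse product: create the cusparse structures for C (handling
   compressed row storage of both B and C), then for CUDA >= 11 run the SpGEMM work estimation and
   compute to obtain the number of nonzeros of C, while older CUDA versions use
   cusparseXcsrgemmNnz(). */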
2335: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2336: {
2337: Mat_Product *product = C->product;
2338: Mat A,B;
2339: Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp;
2340: Mat_SeqAIJ *a,*b,*c;
2341: Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2342: CsrMatrix *Acsr,*Bcsr,*Ccsr;
2343: PetscInt i,j,m,n,k;
2344: PetscBool flg;
2345: PetscErrorCode ierr;
2346: cusparseStatus_t stat;
2347: cudaError_t cerr;
2348: MatProductType ptype;
2349: MatMatCusparse *mmdata;
2350: PetscLogDouble flops;
2351: PetscBool biscompressed,ciscompressed;
2352: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2353: int64_t C_num_rows1, C_num_cols1, C_nnz1;
2354: size_t bufSize2;
2355: cusparseSpMatDescr_t BmatSpDescr;
2356: #else
2357: int cnz;
2358: #endif
2361: MatCheckProduct(C,1);
2362: if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2363: A = product->A;
2364: B = product->B;
2365: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2366: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2367: PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2368: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2369: a = (Mat_SeqAIJ*)A->data;
2370: b = (Mat_SeqAIJ*)B->data;
2371: Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2372: Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2373: if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2374: if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2376: /* product data */
2377: PetscNew(&mmdata);
2378: C->product->data = mmdata;
2379: C->product->destroy = MatDestroy_MatMatCusparse;
2381: MatSeqAIJCUSPARSECopyToGPU(A);
2382: MatSeqAIJCUSPARSECopyToGPU(B);
2383: ptype = product->type;
2384: if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2385: if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2386: biscompressed = PETSC_FALSE;
2387: ciscompressed = PETSC_FALSE;
2388: switch (ptype) {
2389: case MATPRODUCT_AB:
2390: m = A->rmap->n;
2391: n = B->cmap->n;
2392: k = A->cmap->n;
2393: Amat = Acusp->mat;
2394: Bmat = Bcusp->mat;
2395: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2396: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2397: break;
2398: case MATPRODUCT_AtB:
2399: m = A->cmap->n;
2400: n = B->cmap->n;
2401: k = A->rmap->n;
2402: MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
2403: Amat = Acusp->matTranspose;
2404: Bmat = Bcusp->mat;
2405: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2406: break;
2407: case MATPRODUCT_ABt:
2408: m = A->rmap->n;
2409: n = B->rmap->n;
2410: k = A->cmap->n;
2411: MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);
2412: Amat = Acusp->mat;
2413: Bmat = Bcusp->matTranspose;
2414: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2415: break;
2416: default:
2417: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2418: }
2420: /* create cusparse matrix */
2421: MatSetSizes(C,m,n,m,n);
2422: MatSetType(C,MATSEQAIJCUSPARSE);
2423: c = (Mat_SeqAIJ*)C->data;
2424: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2425: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
2426: Ccsr = new CsrMatrix;
2428: c->compressedrow.use = ciscompressed;
2429: if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format too */
2430: c->compressedrow.nrows = a->compressedrow.nrows;
2431: PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);
2432: PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);
2433: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2434: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2435: Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2436: } else {
2437: c->compressedrow.nrows = 0;
2438: c->compressedrow.i = NULL;
2439: c->compressedrow.rindex = NULL;
2440: Ccusp->workVector = NULL;
2441: Cmat->cprowIndices = NULL;
2442: }
2443: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2444: Ccusp->mat = Cmat;
2445: Ccusp->mat->mat = Ccsr;
2446: Ccsr->num_rows = Ccusp->nrows;
2447: Ccsr->num_cols = n;
2448: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2449: stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2450: stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2451: stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2452: cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2453: cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2454: cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2455: cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2456: cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2457: cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2458: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
2459: thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2460: c->nz = 0;
2461: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2462: Ccsr->values = new THRUSTARRAY(c->nz);
2463: goto finalizesym;
2464: }
2466: if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2467: if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2468: Acsr = (CsrMatrix*)Amat->mat;
2469: if (!biscompressed) {
2470: Bcsr = (CsrMatrix*)Bmat->mat;
2471: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2472: BmatSpDescr = Bmat->matDescr;
2473: #endif
2474: } else { /* we need to use row offsets for the full matrix */
2475: CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2476: Bcsr = new CsrMatrix;
2477: Bcsr->num_rows = B->rmap->n;
2478: Bcsr->num_cols = cBcsr->num_cols;
2479: Bcsr->num_entries = cBcsr->num_entries;
2480: Bcsr->column_indices = cBcsr->column_indices;
2481: Bcsr->values = cBcsr->values;
2482: if (!Bcusp->rowoffsets_gpu) {
2483: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2484: Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2485: PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
2486: }
2487: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2488: mmdata->Bcsr = Bcsr;
2489: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2490: if (Bcsr->num_rows && Bcsr->num_cols) {
2491: stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2492: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2493: Bcsr->values->data().get(),
2494: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2495: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2496: }
2497: BmatSpDescr = mmdata->matSpBDescr;
2498: #endif
2499: }
2500: if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2501: if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2502: /* precompute flops count */
2503: if (ptype == MATPRODUCT_AB) {
2504: for (i=0, flops = 0; i<A->rmap->n; i++) {
2505: const PetscInt st = a->i[i];
2506: const PetscInt en = a->i[i+1];
2507: for (j=st; j<en; j++) {
2508: const PetscInt brow = a->j[j];
2509: flops += 2.*(b->i[brow+1] - b->i[brow]);
2510: }
2511: }
2512: } else if (ptype == MATPRODUCT_AtB) {
2513: for (i=0, flops = 0; i<A->rmap->n; i++) {
2514: const PetscInt anzi = a->i[i+1] - a->i[i];
2515: const PetscInt bnzi = b->i[i+1] - b->i[i];
2516: flops += (2.*anzi)*bnzi;
2517: }
2518: } else { /* TODO */
2519: flops = 0.;
2520: }
2522: mmdata->flops = flops;
2523: PetscLogGpuTimeBegin();
2524: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2525: stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2526: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2527: NULL, NULL, NULL,
2528: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2529: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2530: stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2531: /* ask bufferSize bytes for external memory */
2532: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2533: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2534: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2535: mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2536: cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2537: /* inspect the matrices A and B to understand the memory requirement for the next step */
2538: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2539: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2540: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2541: mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2542: /* ask bufferSize again bytes for external memory */
2543: stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2544: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2545: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2546: mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2547: /* Neither the CUSPARSE documentation nor the API is clear here:
2548: we need both buffers to perform the operations properly!
2549: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2550: it only appears in the workEstimation calls, yet it seems to be needed in compute, so presumably its address
2551: is stored in the descriptor! What a messy API... */
2552: cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2553: /* compute the intermediate product of A * B */
2554: stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2555: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2556: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2557: mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2558: /* get matrix C non-zero entries C_nnz1 */
2559: stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2560: c->nz = (PetscInt) C_nnz1;
2561: PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);
2562: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2563: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2564: Ccsr->values = new THRUSTARRAY(c->nz);
2565: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2566: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2567: Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2568: stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2569: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2570: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2571: #else
2572: stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2573: stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2574: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2575: Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2576: Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2577: Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2578: c->nz = cnz;
2579: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2580: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2581: Ccsr->values = new THRUSTARRAY(c->nz);
2582: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2584: stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2585: /* with the old gemm interface (removed from CUDA 11.0 on) we cannot compute the symbolic factorization only.
2586: I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when
2587: D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2588: stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2589: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2590: Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2591: Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2592: Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2593: #endif
2594: cerr = WaitForCUDA();CHKERRCUDA(cerr);
2595: PetscLogGpuFlops(mmdata->flops);
2596: PetscLogGpuTimeEnd();
2597: finalizesym:
2598: c->singlemalloc = PETSC_FALSE;
2599: c->free_a = PETSC_TRUE;
2600: c->free_ij = PETSC_TRUE;
2601: PetscMalloc1(m+1,&c->i);
2602: PetscMalloc1(c->nz,&c->j);
2603: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2604: PetscInt *d_i = c->i;
2605: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2606: THRUSTINTARRAY jj(Ccsr->column_indices->size());
2607: ii = *Ccsr->row_offsets;
2608: jj = *Ccsr->column_indices;
2609: if (ciscompressed) d_i = c->compressedrow.i;
2610: cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2611: cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2612: } else {
2613: PetscInt *d_i = c->i;
2614: if (ciscompressed) d_i = c->compressedrow.i;
2615: cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2616: cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2617: }
2618: if (ciscompressed) { /* need to expand host row offsets */
2619: PetscInt r = 0;
2620: c->i[0] = 0;
2621: for (k = 0; k < c->compressedrow.nrows; k++) {
2622: const PetscInt next = c->compressedrow.rindex[k];
2623: const PetscInt old = c->compressedrow.i[k];
2624: for (; r < next; r++) c->i[r+1] = old;
2625: }
2626: for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2627: }
2628: PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
2629: PetscMalloc1(m,&c->ilen);
2630: PetscMalloc1(m,&c->imax);
2631: c->maxnz = c->nz;
2632: c->nonzerorowcnt = 0;
2633: c->rmax = 0;
2634: for (k = 0; k < m; k++) {
2635: const PetscInt nn = c->i[k+1] - c->i[k];
2636: c->ilen[k] = c->imax[k] = nn;
2637: c->nonzerorowcnt += (PetscInt)!!nn;
2638: c->rmax = PetscMax(c->rmax,nn);
2639: }
2640: MatMarkDiagonal_SeqAIJ(C);
2641: PetscMalloc1(c->nz,&c->a);
2642: Ccsr->num_entries = c->nz;
2644: C->nonzerostate++;
2645: PetscLayoutSetUp(C->rmap);
2646: PetscLayoutSetUp(C->cmap);
2647: Ccusp->nonzerostate = C->nonzerostate;
2648: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2649: C->preallocated = PETSC_TRUE;
2650: C->assembled = PETSC_FALSE;
2651: C->was_assembled = PETSC_FALSE;
2652: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2653: mmdata->reusesym = PETSC_TRUE;
2654: C->offloadmask = PETSC_OFFLOAD_GPU;
2655: }
2656: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2657: return(0);
2658: }
2660: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2662: /* handles sparse or dense B */
2663: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2664: {
2665: Mat_Product *product = mat->product;
2667: PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2670: MatCheckProduct(mat,1);
2671: PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);
2672: if (!product->A->boundtocpu && !product->B->boundtocpu) {
2673: PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);
2674: }
2675: if (product->type == MATPRODUCT_ABC) {
2676: Ciscusp = PETSC_FALSE;
2677: if (!product->C->boundtocpu) {
2678: PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);
2679: }
2680: }
2681: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2682: PetscBool usecpu = PETSC_FALSE;
2683: switch (product->type) {
2684: case MATPRODUCT_AB:
2685: if (product->api_user) {
2686: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
2687: PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2688: PetscOptionsEnd();
2689: } else {
2690: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
2691: PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2692: PetscOptionsEnd();
2693: }
2694: break;
2695: case MATPRODUCT_AtB:
2696: if (product->api_user) {
2697: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
2698: PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2699: PetscOptionsEnd();
2700: } else {
2701: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
2702: PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2703: PetscOptionsEnd();
2704: }
2705: break;
2706: case MATPRODUCT_PtAP:
2707: if (product->api_user) {
2708: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
2709: PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2710: PetscOptionsEnd();
2711: } else {
2712: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
2713: PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2714: PetscOptionsEnd();
2715: }
2716: break;
2717: case MATPRODUCT_RARt:
2718: if (product->api_user) {
2719: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
2720: PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2721: PetscOptionsEnd();
2722: } else {
2723: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
2724: PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2725: PetscOptionsEnd();
2726: }
2727: break;
2728: case MATPRODUCT_ABC:
2729: if (product->api_user) {
2730: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
2731: PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2732: PetscOptionsEnd();
2733: } else {
2734: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
2735: PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2736: PetscOptionsEnd();
2737: }
2738: break;
2739: default:
2740: break;
2741: }
2742: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2743: }
2744: /* dispatch */
2745: if (isdense) {
2746: switch (product->type) {
2747: case MATPRODUCT_AB:
2748: case MATPRODUCT_AtB:
2749: case MATPRODUCT_ABt:
2750: case MATPRODUCT_PtAP:
2751: case MATPRODUCT_RARt:
2752: if (product->A->boundtocpu) {
2753: MatProductSetFromOptions_SeqAIJ_SeqDense(mat);
2754: } else {
2755: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2756: }
2757: break;
2758: case MATPRODUCT_ABC:
2759: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2760: break;
2761: default:
2762: break;
2763: }
2764: } else if (Biscusp && Ciscusp) {
2765: switch (product->type) {
2766: case MATPRODUCT_AB:
2767: case MATPRODUCT_AtB:
2768: case MATPRODUCT_ABt:
2769: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2770: break;
2771: case MATPRODUCT_PtAP:
2772: case MATPRODUCT_RARt:
2773: case MATPRODUCT_ABC:
2774: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2775: break;
2776: default:
2777: break;
2778: }
2779: } else { /* fallback for AIJ */
2780: MatProductSetFromOptions_SeqAIJ(mat);
2781: }
2782: return(0);
2783: }
2785: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2786: {
2790: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2791: return(0);
2792: }
2794: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2795: {
2799: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2800: return(0);
2801: }
2803: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2804: {
2808: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2809: return(0);
2810: }
2812: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2813: {
2817: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2818: return(0);
2819: }
2821: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2822: {
2826: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2827: return(0);
2828: }
2830: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2831: {
2832: int i = blockIdx.x*blockDim.x + threadIdx.x;
2833: if (i < n) y[idx[i]] += x[i];
2834: }
2836: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2837: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2838: {
2839: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2840: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2841: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2842: PetscScalar *xarray,*zarray,*dptr,*beta,*xptr;
2843: PetscErrorCode ierr;
2844: cudaError_t cerr;
2845: cusparseStatus_t stat;
2846: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2847: PetscBool compressed;
2848: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2849: PetscInt nx,ny;
2850: #endif
2853: if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2854: if (!a->nonzerorowcnt) {
2855: if (!yy) {VecSet_SeqCUDA(zz,0);}
2856: else {VecCopy_SeqCUDA(yy,zz);}
2857: return(0);
2858: }
2859: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2860: MatSeqAIJCUSPARSECopyToGPU(A);
2861: if (!trans) {
2862: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2863: if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2864: } else {
2865: if (herm || !A->form_explicit_transpose) {
2866: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2867: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2868: } else {
2869: if (!cusparsestruct->matTranspose) {MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);}
2870: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2871: }
2872: }
2873: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2874: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2876: try {
2877: VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
2878: if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so need to get uptodate zarray on GPU */
2879: else {VecCUDAGetArrayWrite(zz,&zarray);} /* write zz, so no need to init zarray on GPU */
2881: PetscLogGpuTimeBegin();
2882: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2883: /* z = A x + beta y.
2884: If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2885: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2886: */
2887: xptr = xarray;
2888: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2889: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2890: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2891: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2892: allocated to accommodate different uses. So we get the length info directly from mat.
2893: */
2894: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2895: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2896: nx = mat->num_cols;
2897: ny = mat->num_rows;
2898: }
2899: #endif
2900: } else {
2901: /* z = A^T x + beta y
2902: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2903: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2904: */
2905: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2906: dptr = zarray;
2907: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2908: if (compressed) { /* Scatter x to work vector */
2909: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2910: thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2911: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2912: VecCUDAEqualsReverse());
2913: }
2914: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2915: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2916: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2917: nx = mat->num_rows;
2918: ny = mat->num_cols;
2919: }
2920: #endif
2921: }
2923: /* csr_spmv does y = alpha op(A) x + beta y */
2924: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2925: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2926: if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2927: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2928: stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2929: stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2930: stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2931: matstruct->matDescr,
2932: matstruct->cuSpMV[opA].vecXDescr, beta,
2933: matstruct->cuSpMV[opA].vecYDescr,
2934: cusparse_scalartype,
2935: cusparsestruct->spmvAlg,
2936: &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2937: cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2939: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2940: } else {
2941: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2942: stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2943: stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2944: }
2946: stat = cusparseSpMV(cusparsestruct->handle, opA,
2947: matstruct->alpha_one,
2948: matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2949: matstruct->cuSpMV[opA].vecXDescr,
2950: beta,
2951: matstruct->cuSpMV[opA].vecYDescr,
2952: cusparse_scalartype,
2953: cusparsestruct->spmvAlg,
2954: matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2955: #else
2956: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2957: stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2958: mat->num_rows, mat->num_cols,
2959: mat->num_entries, matstruct->alpha_one, matstruct->descr,
2960: mat->values->data().get(), mat->row_offsets->data().get(),
2961: mat->column_indices->data().get(), xptr, beta,
2962: dptr);CHKERRCUSPARSE(stat);
2963: #endif
2964: } else {
2965: if (cusparsestruct->nrows) {
2966: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2967: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2968: #else
2969: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2970: stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2971: matstruct->alpha_one, matstruct->descr, hybMat,
2972: xptr, beta,
2973: dptr);CHKERRCUSPARSE(stat);
2974: #endif
2975: }
2976: }
2977: cerr = WaitForCUDA();CHKERRCUDA(cerr);
2978: PetscLogGpuTimeEnd();
2980: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2981: if (yy) { /* MatMultAdd: zz = A*xx + yy */
2982: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2983: VecCopy_SeqCUDA(yy,zz); /* zz = yy */
2984: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2985: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2986: }
2987: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2988: VecSet_SeqCUDA(zz,0);
2989: }
2991: /* ScatterAdd the result from work vector into the full vector when A is compressed */
2992: if (compressed) {
2993: PetscLogGpuTimeBegin();
2994: /* I wanted to make this for_each asynchronous, but failed. thrust::async::for_each() returns an event (internally registered),
2995: and the destructor at the end of the scope calls cudaStreamSynchronize() on this stream. One would have to store all the events to
2996: prevent that. So I just use a ScatterAdd kernel instead.
2997: */
2998: #if 0
2999: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3000: thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3001: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3002: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3003: VecCUDAPlusEquals());
3004: #else
3005: PetscInt n = matstruct->cprowIndices->size();
3006: ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
3007: #endif
3008: cerr = WaitForCUDA();CHKERRCUDA(cerr);
3009: PetscLogGpuTimeEnd();
3010: }
3011: } else {
3012: if (yy && yy != zz) {
3013: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
3014: }
3015: }
3016: VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
3017: if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
3018: else {VecCUDARestoreArrayWrite(zz,&zarray);}
3019: } catch(char *ex) {
3020: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3021: }
3022: if (yy) {
3023: PetscLogGpuFlops(2.0*a->nz);
3024: } else {
3025: PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
3026: }
3027: return(0);
3028: }
3030: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3031: {
3035: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
3036: return(0);
3037: }
3039: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
3040: {
3041: PetscErrorCode ierr;
3042: PetscSplitCSRDataStructure *d_mat = NULL;
3044: if (A->factortype == MAT_FACTOR_NONE) {
3045: d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
3046: }
3047: MatAssemblyEnd_SeqAIJ(A,mode); // this does very little if assembled on GPU - call it?
3048: if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) return(0);
3049: if (d_mat) {
3050: A->offloadmask = PETSC_OFFLOAD_GPU;
3051: }
3053: return(0);
3054: }
3056: /* --------------------------------------------------------------------------------*/
3057: /*@
3058: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3059: (the default sequential PETSc format). This matrix will ultimately be pushed down
3060: to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
3061: assembly performance the user should preallocate the matrix storage by setting
3062: the parameter nz (or the array nnz). By setting these parameters accurately,
3063: performance during matrix assembly can be increased by more than a factor of 50.
3065: Collective
3067: Input Parameters:
3068: + comm - MPI communicator, set to PETSC_COMM_SELF
3069: . m - number of rows
3070: . n - number of columns
3071: . nz - number of nonzeros per row (same for all rows)
3072: - nnz - array containing the number of nonzeros in the various rows
3073: (possibly different for each row) or NULL
3075: Output Parameter:
3076: . A - the matrix
3078: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3079: MatXXXXSetPreallocation() paradigm instead of this routine directly.
3080: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3082: Notes:
3083: If nnz is given then nz is ignored
3085: The AIJ format (also called the Yale sparse matrix format or
3086: compressed row storage) is fully compatible with standard Fortran 77
3087: storage. That is, the stored row and column indices can begin at
3088: either one (as in Fortran) or zero. See the users' manual for details.
3090: Specify the preallocated storage with either nz or nnz (not both).
3091: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3092: allocation. For large problems you MUST preallocate memory or you
3093: will get TERRIBLE performance, see the users' manual chapter on matrices.
3095: By default, this format uses inodes (identical nodes) when possible, to
3096: improve numerical efficiency of matrix-vector products and solves. We
3097: search for consecutive rows with the same nonzero structure, thereby
3098: reusing matrix information to achieve increased efficiency.
3100: Level: intermediate
3102: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3103: @*/
3104: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3105: {
3109: MatCreate(comm,A);
3110: MatSetSizes(*A,m,n,m,n);
3111: MatSetType(*A,MATSEQAIJCUSPARSE);
3112: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
3113: return(0);
3114: }
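/* Editorial sketch (not part of the PETSc source): a minimal example of the MatCreate()/MatSetType()/
   MatSeqAIJSetPreallocation() paradigm that the man page above recommends over calling
   MatCreateSeqAIJCUSPARSE() directly. All routines used are standard PETSc API; the helper name
   ExampleCreateTridiagonalCUSPARSE is hypothetical and error checking is elided as in this listing. */
#if 0
static PetscErrorCode ExampleCreateTridiagonalCUSPARSE(PetscInt n,Mat *A)
{
  PetscInt i;

  MatCreate(PETSC_COMM_SELF,A);
  MatSetSizes(*A,n,n,n,n);
  MatSetType(*A,MATSEQAIJCUSPARSE);
  MatSeqAIJSetPreallocation(*A,3,NULL);            /* at most 3 nonzeros per row */
  for (i=0; i<n; i++) {
    if (i>0)   MatSetValue(*A,i,i-1,-1.0,INSERT_VALUES);
    MatSetValue(*A,i,i,2.0,INSERT_VALUES);
    if (i<n-1) MatSetValue(*A,i,i+1,-1.0,INSERT_VALUES);
  }
  MatAssemblyBegin(*A,MAT_FINAL_ASSEMBLY);
  MatAssemblyEnd(*A,MAT_FINAL_ASSEMBLY);           /* data is copied to the GPU on first use, e.g. MatMult() */
  return(0);
}
#endif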
3116: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3117: {
3118: PetscErrorCode ierr;
3119: PetscSplitCSRDataStructure *d_mat = NULL;
3122: if (A->factortype == MAT_FACTOR_NONE) {
3123: d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
3124: ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
3125: MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
3126: } else {
3127: MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
3128: }
3129: if (d_mat) {
3130: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3131: cudaError_t err;
3132: PetscSplitCSRDataStructure h_mat;
3133: PetscInfo(A,"Have device matrix\n");
3134: err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
3135: if (a->compressedrow.use) {
3136: err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
3137: }
3138: err = cudaFree(d_mat);CHKERRCUDA(err);
3139: }
3140: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3141: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
3142: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3143: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3144: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3145: PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
3146: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3147: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3148: MatDestroy_SeqAIJ(A);
3149: return(0);
3150: }
3152: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3153: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
3154: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
3155: {
3159: MatDuplicate_SeqAIJ(A,cpvalues,B);
3160: MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
3161: return(0);
3162: }
3164: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
3165: {
3166: PetscErrorCode ierr;
3167: Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3168: Mat_SeqAIJCUSPARSE *cy;
3169: Mat_SeqAIJCUSPARSE *cx;
3170: PetscScalar *ay;
3171: const PetscScalar *ax;
3172: CsrMatrix *csry,*csrx;
3173: cudaError_t cerr;
3176: cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3177: cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3178: if (X->ops->axpy != Y->ops->axpy) {
3179: MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3180: MatAXPY_SeqAIJ(Y,a,X,str);
3181: return(0);
3182: }
3183: /* if we are here, it means both matrices are bound to GPU */
3184: MatSeqAIJCUSPARSECopyToGPU(Y);
3185: MatSeqAIJCUSPARSECopyToGPU(X);
3186: if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3187: if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3188: csry = (CsrMatrix*)cy->mat->mat;
3189: csrx = (CsrMatrix*)cx->mat->mat;
3190: /* see if we can turn this into a cublas axpy */
3191: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3192: bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3193: if (eq) {
3194: eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3195: }
3196: if (eq) str = SAME_NONZERO_PATTERN;
3197: }
3198: /* spgeam is buggy with one column */
3199: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3201: if (str == SUBSET_NONZERO_PATTERN) {
3202: cusparseStatus_t stat;
3203: PetscScalar b = 1.0;
3204: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3205: size_t bufferSize;
3206: void *buffer;
3207: #endif
3209: MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3210: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3211: stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3212: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3213: stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3214: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3215: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3216: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3217: cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3218: PetscLogGpuTimeBegin();
3219: stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3220: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3221: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3222: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3223: cerr = WaitForCUDA();CHKERRCUDA(cerr);
3224: PetscLogGpuFlops(x->nz + y->nz);
3225: PetscLogGpuTimeEnd();
3226: cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3227: #else
3228: PetscLogGpuTimeBegin();
3229: stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3230: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3231: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3232: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3233: cerr = WaitForCUDA();CHKERRCUDA(cerr);
3234: PetscLogGpuFlops(x->nz + y->nz);
3235: PetscLogGpuTimeEnd();
3236: #endif
3237: stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3238: MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3239: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3240: MatSeqAIJInvalidateDiagonal(Y);
3241: } else if (str == SAME_NONZERO_PATTERN) {
3242: cublasHandle_t cublasv2handle;
3243: cublasStatus_t berr;
3244: PetscBLASInt one = 1, bnz = 1;
3246: MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3247: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3248: PetscCUBLASGetHandle(&cublasv2handle);
3249: PetscBLASIntCast(x->nz,&bnz);
3250: PetscLogGpuTimeBegin();
3251: berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3252: cerr = WaitForCUDA();CHKERRCUDA(cerr);
3253: PetscLogGpuFlops(2.0*bnz);
3254: PetscLogGpuTimeEnd();
3255: MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3256: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3257: MatSeqAIJInvalidateDiagonal(Y);
3258: } else {
3259: MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3260: MatAXPY_SeqAIJ(Y,a,X,str);
3261: }
3262: return(0);
3263: }
3265: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3266: {
3268: Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data;
3269: PetscScalar *ay;
3270: cudaError_t cerr;
3271: cublasHandle_t cublasv2handle;
3272: cublasStatus_t berr;
3273: PetscBLASInt one = 1, bnz = 1;
3276: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3277: PetscCUBLASGetHandle(&cublasv2handle);
3278: PetscBLASIntCast(y->nz,&bnz);
3279: PetscLogGpuTimeBegin();
3280: berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
3281: cerr = WaitForCUDA();CHKERRCUDA(cerr);
3282: PetscLogGpuFlops(bnz);
3283: PetscLogGpuTimeEnd();
3284: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3285: MatSeqAIJInvalidateDiagonal(Y);
3286: return(0);
3287: }
3289: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3290: {
3291: PetscErrorCode ierr;
3292: PetscBool both = PETSC_FALSE;
3293: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3296: if (A->factortype == MAT_FACTOR_NONE) {
3297: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
3298: if (spptr->mat) {
3299: CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
3300: if (matrix->values) {
3301: both = PETSC_TRUE;
3302: thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3303: }
3304: }
3305: if (spptr->matTranspose) {
3306: CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
3307: if (matrix->values) {
3308: thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3309: }
3310: }
3311: }
3312: //MatZeroEntries_SeqAIJ(A);
3313: PetscArrayzero(a->a,a->i[A->rmap->n]);
3314: MatSeqAIJInvalidateDiagonal(A);
3315: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3316: else A->offloadmask = PETSC_OFFLOAD_CPU;
3318: return(0);
3319: }
3321: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3322: {
3323: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3327: if (A->factortype != MAT_FACTOR_NONE) return(0);
3328: if (flg) {
3329: MatSeqAIJCUSPARSECopyFromGPU(A);
3331: A->ops->scale = MatScale_SeqAIJ;
3332: A->ops->axpy = MatAXPY_SeqAIJ;
3333: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
3334: A->ops->mult = MatMult_SeqAIJ;
3335: A->ops->multadd = MatMultAdd_SeqAIJ;
3336: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
3337: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
3338: A->ops->multhermitiantranspose = NULL;
3339: A->ops->multhermitiantransposeadd = NULL;
3340: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
3341: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3342: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3343: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3344: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3345: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3346: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);
3347: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3348: } else {
3349: A->ops->scale = MatScale_SeqAIJCUSPARSE;
3350: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
3351: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
3352: A->ops->mult = MatMult_SeqAIJCUSPARSE;
3353: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
3354: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
3355: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
3356: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3357: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3358: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
3359: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);
3360: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3361: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3362: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);
3363: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);
3364: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);
3365: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3366: }
3367: A->boundtocpu = flg;
3368: a->inode.use = flg;
3369: return(0);
3370: }
3372: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3373: {
3374: PetscErrorCode ierr;
3375: cusparseStatus_t stat;
3376: Mat B;
3379: PetscCUDAInitializeCheck(); /* first use of CUSPARSE may be via MatConvert */
3380: if (reuse == MAT_INITIAL_MATRIX) {
3381: MatDuplicate(A,MAT_COPY_VALUES,newmat);
3382: } else if (reuse == MAT_REUSE_MATRIX) {
3383: MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
3384: }
3385: B = *newmat;
3387: PetscFree(B->defaultvectype);
3388: PetscStrallocpy(VECCUDA,&B->defaultvectype);
3390: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3391: if (B->factortype == MAT_FACTOR_NONE) {
3392: Mat_SeqAIJCUSPARSE *spptr;
3393: PetscNew(&spptr);
3394: stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3395: stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3396: spptr->format = MAT_CUSPARSE_CSR;
3397: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3398: spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3399: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3400: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3401: #endif
3402: B->spptr = spptr;
3403: } else {
3404: Mat_SeqAIJCUSPARSETriFactors *spptr;
3406: PetscNew(&spptr);
3407: stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3408: stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3409: B->spptr = spptr;
3410: }
3411: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3412: }
3413: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
3414: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
3415: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
3416: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3417: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
3418: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
3420: MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
3421: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
3422: PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
3423: return(0);
3424: }
3426: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3427: {
3431: MatCreate_SeqAIJ(B);
3432: MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
3433: return(0);
3434: }
3436: /*MC
3437: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3439: A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3440: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3441: All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
3443: Options Database Keys:
3444: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3445: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3446: - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3448: Level: beginner
3450: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3451: M*/
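/* Editorial sketch (not part of the PETSc source): one way the options documented above are exercised
   from code. MatCUSPARSESetFormat() has the same effect as the -mat_cusparse_*_storage_format options;
   the helper name ExampleSelectCUSPARSEFormat is hypothetical. */
#if 0
static PetscErrorCode ExampleSelectCUSPARSEFormat(Mat A)
{
  MatSetType(A,MATSEQAIJCUSPARSE);                            /* same as -mat_type aijcusparse on a one-process communicator */
  MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_CSR); /* same as -mat_cusparse_mult_storage_format csr */
  MatSetFromOptions(A);                                       /* also picks up -mat_cusparse_storage_format from the options database */
  return(0);
}
#endif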
3453: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3455: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3456: {
3460: MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);
3461: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
3462: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
3463: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
3464: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
3466: return(0);
3467: }
3469: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3470: {
3471: PetscErrorCode ierr;
3472: cusparseStatus_t stat;
3475: if (*cusparsestruct) {
3476: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
3477: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
3478: delete (*cusparsestruct)->workVector;
3479: delete (*cusparsestruct)->rowoffsets_gpu;
3480: delete (*cusparsestruct)->cooPerm;
3481: delete (*cusparsestruct)->cooPerm_a;
3482: delete (*cusparsestruct)->csr2csc_i;
3483: if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3484: PetscFree(*cusparsestruct);
3485: }
3486: return(0);
3487: }
3489: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3490: {
3492: if (*mat) {
3493: delete (*mat)->values;
3494: delete (*mat)->column_indices;
3495: delete (*mat)->row_offsets;
3496: delete *mat;
3497: *mat = 0;
3498: }
3499: return(0);
3500: }
3502: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3503: {
3504: cusparseStatus_t stat;
3505: PetscErrorCode ierr;
3508: if (*trifactor) {
3509: if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3510: if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
3511: CsrMatrix_Destroy(&(*trifactor)->csrMat);
3512: if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
3513: if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3514: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3515: if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3516: #endif
3517: PetscFree(*trifactor);
3518: }
3519: return(0);
3520: }
3522: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
3523: {
3524: CsrMatrix *mat;
3525: cusparseStatus_t stat;
3526: cudaError_t err;
3529: if (*matstruct) {
3530: if ((*matstruct)->mat) {
3531: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3532: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3533: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3534: #else
3535: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3536: stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3537: #endif
3538: } else {
3539: mat = (CsrMatrix*)(*matstruct)->mat;
3540: CsrMatrix_Destroy(&mat);
3541: }
3542: }
3543: if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
3544: delete (*matstruct)->cprowIndices;
3545: if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
3546: if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
3547: if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
3549: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3550: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3551: if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3552: for (int i=0; i<3; i++) {
3553: if (mdata->cuSpMV[i].initialized) {
3554: err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3555: stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3556: stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3557: }
3558: }
3559: #endif
3560: delete *matstruct;
3561: *matstruct = NULL;
3562: }
3563: return(0);
3564: }
3566: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3567: {
3571: if (*trifactors) {
3572: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
3573: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
3574: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
3575: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
3576: delete (*trifactors)->rpermIndices;
3577: delete (*trifactors)->cpermIndices;
3578: delete (*trifactors)->workVector;
3579: (*trifactors)->rpermIndices = NULL;
3580: (*trifactors)->cpermIndices = NULL;
3581: (*trifactors)->workVector = NULL;
3582: if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
3583: if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
3584: }
3585: return(0);
3586: }
3588: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3589: {
3590: PetscErrorCode ierr;
3591: cusparseHandle_t handle;
3592: cusparseStatus_t stat;
3595: if (*trifactors) {
3596: MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
3597: if ((handle = (*trifactors)->handle)) {
3598: stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
3599: }
3600: PetscFree(*trifactors);
3601: }
3602: return(0);
3603: }
3605: struct IJCompare
3606: {
3607: __host__ __device__
3608: inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3609: {
3610: if (t1.get<0>() < t2.get<0>()) return true;
3611: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3612: return false;
3613: }
3614: };
3616: struct IJEqual
3617: {
3618: __host__ __device__
3619: inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3620: {
3621: if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3622: return true;
3623: }
3624: };
3626: struct IJDiff
3627: {
3628: __host__ __device__
3629: inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3630: {
3631: return t1 == t2 ? 0 : 1;
3632: }
3633: };
3635: struct IJSum
3636: {
3637: __host__ __device__
3638: inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3639: {
3640: return t1||t2;
3641: }
3642: };
3644: #include <thrust/iterator/discard_iterator.h>
3645: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3646: {
3647: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3648: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3649: THRUSTARRAY *cooPerm_v = NULL;
3650: thrust::device_ptr<const PetscScalar> d_v;
3651: CsrMatrix *matrix;
3652: PetscErrorCode ierr;
3653: cudaError_t cerr;
3654: PetscInt n;
3657: if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
3658: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
3659: if (!cusp->cooPerm) {
3660: MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
3661: MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
3662: return(0);
3663: }
3664: matrix = (CsrMatrix*)cusp->mat->mat;
3665: if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3666: if (!v) {
3667: if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3668: goto finalize;
3669: }
3670: n = cusp->cooPerm->size();
3671: if (isCudaMem(v)) {
3672: d_v = thrust::device_pointer_cast(v);
3673: } else {
3674: cooPerm_v = new THRUSTARRAY(n);
3675: cooPerm_v->assign(v,v+n);
3676: d_v = cooPerm_v->data();
3677: PetscLogCpuToGpu(n*sizeof(PetscScalar));
3678: }
3679: PetscLogGpuTimeBegin();
3680: if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3681: if (cusp->cooPerm_a) {
3682: THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3683: auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3684: thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3685: thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3686: delete cooPerm_w;
3687: } else {
3688: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3689: matrix->values->begin()));
3690: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3691: matrix->values->end()));
3692: thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
3693: }
3694: } else {
3695: if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3696: auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3697: thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3698: } else {
3699: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3700: matrix->values->begin()));
3701: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3702: matrix->values->end()));
3703: thrust::for_each(zibit,zieit,VecCUDAEquals());
3704: }
3705: }
3706: cerr = WaitForCUDA();CHKERRCUDA(cerr);
3707: PetscLogGpuTimeEnd();
3708: finalize:
3709: delete cooPerm_v;
3710: A->offloadmask = PETSC_OFFLOAD_GPU;
3711: PetscObjectStateIncrease((PetscObject)A);
3712: /* shorter version of MatAssemblyEnd_SeqAIJ */
3713: PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);
3714: PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");
3715: PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);
3716: a->reallocs = 0;
3717: A->info.mallocs += 0;
3718: A->info.nz_unneeded = 0;
3719: A->assembled = A->was_assembled = PETSC_TRUE;
3720: A->num_ass++;
3721: return(0);
3722: }
3724: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3725: {
3726: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3727: PetscErrorCode ierr;
3731: if (!cusp) return(0);
3732: if (destroy) {
3733: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
3734: delete cusp->csr2csc_i;
3735: cusp->csr2csc_i = NULL;
3736: }
3737: A->transupdated = PETSC_FALSE;
3738: return(0);
3739: }
3741: #include <thrust/binary_search.h>
3742: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
3743: {
3744: PetscErrorCode ierr;
3745: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3746: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3747: PetscInt cooPerm_n, nzr = 0;
3748: cudaError_t cerr;
3751: PetscLayoutSetUp(A->rmap);
3752: PetscLayoutSetUp(A->cmap);
3753: cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3754: if (n != cooPerm_n) {
3755: delete cusp->cooPerm;
3756: delete cusp->cooPerm_a;
3757: cusp->cooPerm = NULL;
3758: cusp->cooPerm_a = NULL;
3759: }
3760: if (n) {
3761: THRUSTINTARRAY d_i(n);
3762: THRUSTINTARRAY d_j(n);
3763: THRUSTINTARRAY ii(A->rmap->n);
3765: if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); }
3766: if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
3768: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
3769: d_i.assign(coo_i,coo_i+n);
3770: d_j.assign(coo_j,coo_j+n);
3771: auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3772: auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
3774: PetscLogGpuTimeBegin();
3775: thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3776: thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
3777: *cusp->cooPerm_a = d_i;
3778: THRUSTINTARRAY w = d_j;
3780: auto nekey = thrust::unique(fkey, ekey, IJEqual());
3781: if (nekey == ekey) { /* all entries are unique */
3782: delete cusp->cooPerm_a;
3783: cusp->cooPerm_a = NULL;
3784: } else { /* I couldn't come up with a more elegant algorithm */
3785: adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
3786: adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
3787: (*cusp->cooPerm_a)[0] = 0;
3788: w[0] = 0;
3789: thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
3790: thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
3791: }
3792: thrust::counting_iterator<PetscInt> search_begin(0);
3793: thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
3794: search_begin, search_begin + A->rmap->n,
3795: ii.begin());
3796: cerr = WaitForCUDA();CHKERRCUDA(cerr);
3797: PetscLogGpuTimeEnd();
3799: MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);
3800: a->singlemalloc = PETSC_FALSE;
3801: a->free_a = PETSC_TRUE;
3802: a->free_ij = PETSC_TRUE;
3803: PetscMalloc1(A->rmap->n+1,&a->i);
3804: a->i[0] = 0;
3805: cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3806: a->nz = a->maxnz = a->i[A->rmap->n];
3807: a->rmax = 0;
3808: PetscMalloc1(a->nz,&a->a);
3809: PetscMalloc1(a->nz,&a->j);
3810: cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3811: if (!a->ilen) { PetscMalloc1(A->rmap->n,&a->ilen); }
3812: if (!a->imax) { PetscMalloc1(A->rmap->n,&a->imax); }
3813: for (PetscInt i = 0; i < A->rmap->n; i++) {
3814: const PetscInt nnzr = a->i[i+1] - a->i[i];
3815: nzr += (PetscInt)!!(nnzr);
3816: a->ilen[i] = a->imax[i] = nnzr;
3817: a->rmax = PetscMax(a->rmax,nnzr);
3818: }
3819: a->nonzerorowcnt = nzr;
3820: A->preallocated = PETSC_TRUE;
3821: PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));
3822: MatMarkDiagonal_SeqAIJ(A);
3823: } else {
3824: MatSeqAIJSetPreallocation(A,0,NULL);
3825: }
3826: MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);
3828: /* We want to allocate the CUSPARSE struct for matvec now.
3829: The code is so convoluted now that I prefer to copy zeros */
3830: PetscArrayzero(a->a,a->nz);
3831: MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);
3832: A->offloadmask = PETSC_OFFLOAD_CPU;
3833: A->nonzerostate++;
3834: MatSeqAIJCUSPARSECopyToGPU(A);
3835: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
3837: A->assembled = PETSC_FALSE;
3838: A->was_assembled = PETSC_FALSE;
3839: return(0);
3840: }
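/* A minimal usage sketch (not part of this file): the type-specific routine above is normally
   reached through the generic COO assembly interface. Assuming the public
   MatSetPreallocationCOO()/MatSetValuesCOO() entry points and a hypothetical 2x2 example,
   assembly would look roughly like

     Mat         A;
     PetscInt    coo_i[] = {0,0,1,1}, coo_j[] = {0,1,0,1};
     PetscScalar v[]     = {1.0,2.0,3.0,4.0};

     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,2,2,2,2);
     MatSetType(A,MATSEQAIJCUSPARSE);
     MatSetPreallocationCOO(A,4,coo_i,coo_j);  // builds the CSR pattern as done above
     MatSetValuesCOO(A,v,INSERT_VALUES);       // fills the values (routine defined elsewhere)
*/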
3842: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
3843: {
3844: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3845: CsrMatrix *csr;
3846: PetscErrorCode ierr;
3852: if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3853: MatSeqAIJCUSPARSECopyToGPU(A);
3854: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3855: csr = (CsrMatrix*)cusp->mat->mat;
3856: if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3857: *a = csr->values->data().get();
3858: return(0);
3859: }
3861: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
3862: {
3867: *a = NULL;
3868: return(0);
3869: }
3871: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
3872: {
3873: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3874: CsrMatrix *csr;
3875: PetscErrorCode ierr;
3881: if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3882: MatSeqAIJCUSPARSECopyToGPU(A);
3883: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3884: csr = (CsrMatrix*)cusp->mat->mat;
3885: if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3886: *a = csr->values->data().get();
3887: A->offloadmask = PETSC_OFFLOAD_GPU;
3888: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
3889: return(0);
3890: }
3892: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
3893: {
3900: PetscObjectStateIncrease((PetscObject)A);
3901: *a = NULL;
3902: return(0);
3903: }
3905: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
3906: {
3907: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3908: CsrMatrix *csr;
3909: PetscErrorCode ierr;
3915: if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3916: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3917: csr = (CsrMatrix*)cusp->mat->mat;
3918: if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3919: *a = csr->values->data().get();
3920: A->offloadmask = PETSC_OFFLOAD_GPU;
3921: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
3922: return(0);
3923: }
3925: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
3926: {
3933: PetscObjectStateIncrease((PetscObject)A);
3934: *a = NULL;
3935: return(0);
3936: }
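/* A minimal usage sketch for the accessors above (assuming A is an assembled MATSEQAIJCUSPARSE
   matrix): read-only access to the CSR values on the device could look roughly like

     const PetscScalar *values;
     MatSeqAIJCUSPARSEGetArrayRead(A,&values);     // values points to device memory
     // ... use values in a kernel or a thrust/cusparse call ...
     MatSeqAIJCUSPARSERestoreArrayRead(A,&values);

   The non-read variants additionally mark the GPU copy as authoritative (offloadmask) and
   invalidate the cached transpose, as done above, so they are the ones to use when the values
   are modified. */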
3938: struct IJCompare4
3939: {
3940: __host__ __device__
3941: inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
3942: {
3943: if (t1.get<0>() < t2.get<0>()) return true;
3944: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3945: return false;
3946: }
3947: };
3949: struct Shift
3950: {
3951: int _shift;
3953: Shift(int shift) : _shift(shift) {}
3954: __host__ __device__
3955: inline int operator() (const int &c)
3956: {
3957: return c + _shift;
3958: }
3959: };
3961: /* merges two SeqAIJCUSPARSE matrices side by side, the [A';B']' operation in MATLAB notation */
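/* Illustration of the convention assumed below: with A of size m x nA and B of size m x nB, C is
   m x (nA+nB) and C = [A B]; B's column indices are shifted by nA (the Shift functor above) and the
   two COO streams are merged row-wise with IJCompare4, while cooPerm records where each entry of A
   and of B landed so that MAT_REUSE_MATRIX only has to permute the values. */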
3962: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3963: {
3964: PetscErrorCode ierr;
3965: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3966: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3967: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3968: CsrMatrix *Acsr,*Bcsr,*Ccsr;
3969: PetscInt Annz,Bnnz;
3970: cusparseStatus_t stat;
3971: PetscInt i,m,n,zero = 0;
3972: cudaError_t cerr;
3980: if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
3981: if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3982: if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3983: if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3984: if (reuse == MAT_INITIAL_MATRIX) {
3985: m = A->rmap->n;
3986: n = A->cmap->n + B->cmap->n;
3987: MatCreate(PETSC_COMM_SELF,C);
3988: MatSetSizes(*C,m,n,m,n);
3989: MatSetType(*C,MATSEQAIJCUSPARSE);
3990: c = (Mat_SeqAIJ*)(*C)->data;
3991: Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3992: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
3993: Ccsr = new CsrMatrix;
3994: Cmat->cprowIndices = NULL;
3995: c->compressedrow.use = PETSC_FALSE;
3996: c->compressedrow.nrows = 0;
3997: c->compressedrow.i = NULL;
3998: c->compressedrow.rindex = NULL;
3999: Ccusp->workVector = NULL;
4000: Ccusp->nrows = m;
4001: Ccusp->mat = Cmat;
4002: Ccusp->mat->mat = Ccsr;
4003: Ccsr->num_rows = m;
4004: Ccsr->num_cols = n;
4005: stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4006: stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4007: stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4008: cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4009: cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4010: cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4011: cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4012: cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4013: cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4014: MatSeqAIJCUSPARSECopyToGPU(A);
4015: MatSeqAIJCUSPARSECopyToGPU(B);
4016: MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
4017: MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);
4018: if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4019: if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4021: Acsr = (CsrMatrix*)Acusp->mat->mat;
4022: Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4023: Annz = (PetscInt)Acsr->column_indices->size();
4024: Bnnz = (PetscInt)Bcsr->column_indices->size();
4025: c->nz = Annz + Bnnz;
4026: Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4027: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4028: Ccsr->values = new THRUSTARRAY(c->nz);
4029: Ccsr->num_entries = c->nz;
4030: Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4031: if (c->nz) {
4032: auto Acoo = new THRUSTINTARRAY32(Annz);
4033: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4034: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4035: THRUSTINTARRAY32 *Aroff,*Broff;
4037: if (a->compressedrow.use) { /* need full row offset */
4038: if (!Acusp->rowoffsets_gpu) {
4039: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4040: Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4041: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
4042: }
4043: Aroff = Acusp->rowoffsets_gpu;
4044: } else Aroff = Acsr->row_offsets;
4045: if (b->compressedrow.use) { /* need full row offset */
4046: if (!Bcusp->rowoffsets_gpu) {
4047: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4048: Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4049: PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
4050: }
4051: Broff = Bcusp->rowoffsets_gpu;
4052: } else Broff = Bcsr->row_offsets;
4053: PetscLogGpuTimeBegin();
4054: stat = cusparseXcsr2coo(Acusp->handle,
4055: Aroff->data().get(),
4056: Annz,
4057: m,
4058: Acoo->data().get(),
4059: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4060: stat = cusparseXcsr2coo(Bcusp->handle,
4061: Broff->data().get(),
4062: Bnnz,
4063: m,
4064: Bcoo->data().get(),
4065: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4066: /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4067: auto Aperm = thrust::make_constant_iterator(1);
4068: auto Bperm = thrust::make_constant_iterator(0);
4069: #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4070: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4071: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4072: #else
4073: /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4074: auto Bcib = Bcsr->column_indices->begin();
4075: auto Bcie = Bcsr->column_indices->end();
4076: thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4077: #endif
4078: auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4079: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4080: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4081: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4082: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4083: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4084: auto p1 = Ccusp->cooPerm->begin();
4085: auto p2 = Ccusp->cooPerm->begin();
4086: thrust::advance(p2,Annz);
4087: PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4088: #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4089: thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4090: #endif
4091: auto cci = thrust::make_counting_iterator(zero);
4092: auto cce = thrust::make_counting_iterator(c->nz);
4093: #if 0 //Errors on SUMMIT cuda 11.1.0
4094: PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4095: #else
4096: auto pred = thrust::identity<int>();
4097: PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4098: PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4099: #endif
4100: stat = cusparseXcoo2csr(Ccusp->handle,
4101: Ccoo->data().get(),
4102: c->nz,
4103: m,
4104: Ccsr->row_offsets->data().get(),
4105: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4106: cerr = WaitForCUDA();CHKERRCUDA(cerr);
4107: PetscLogGpuTimeEnd();
4108: delete wPerm;
4109: delete Acoo;
4110: delete Bcoo;
4111: delete Ccoo;
4112: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4113: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4114: Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4115: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4116: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4117: #endif
4118: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4119: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4120: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4121: CsrMatrix *CcsrT = new CsrMatrix;
4122: CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4123: CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4125: (*C)->form_explicit_transpose = PETSC_TRUE;
4126: (*C)->transupdated = PETSC_TRUE;
4127: Ccusp->rowoffsets_gpu = NULL;
4128: CmatT->cprowIndices = NULL;
4129: CmatT->mat = CcsrT;
4130: CcsrT->num_rows = n;
4131: CcsrT->num_cols = m;
4132: CcsrT->num_entries = c->nz;
4134: CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4135: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4136: CcsrT->values = new THRUSTARRAY(c->nz);
4138: PetscLogGpuTimeBegin();
4139: auto rT = CcsrT->row_offsets->begin();
4140: if (AT) {
4141: rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4142: thrust::advance(rT,-1);
4143: }
4144: if (BT) {
4145: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4146: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4147: thrust::copy(titb,tite,rT);
4148: }
4149: auto cT = CcsrT->column_indices->begin();
4150: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4151: if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4152: auto vT = CcsrT->values->begin();
4153: if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4154: if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4155: cerr = WaitForCUDA();CHKERRCUDA(cerr);
4156: PetscLogGpuTimeEnd();
4158: stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4159: stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4160: stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4161: cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4162: cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4163: cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4164: cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4165: cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4166: cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4167: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4168: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4169: CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4170: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4171: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4172: #endif
4173: Ccusp->matTranspose = CmatT;
4174: }
4175: }
4177: c->singlemalloc = PETSC_FALSE;
4178: c->free_a = PETSC_TRUE;
4179: c->free_ij = PETSC_TRUE;
4180: PetscMalloc1(m+1,&c->i);
4181: PetscMalloc1(c->nz,&c->j);
4182: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4183: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4184: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4185: ii = *Ccsr->row_offsets;
4186: jj = *Ccsr->column_indices;
4187: cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4188: cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4189: } else {
4190: cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4191: cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4192: }
4193: PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
4194: PetscMalloc1(m,&c->ilen);
4195: PetscMalloc1(m,&c->imax);
4196: c->maxnz = c->nz;
4197: c->nonzerorowcnt = 0;
4198: c->rmax = 0;
4199: for (i = 0; i < m; i++) {
4200: const PetscInt nn = c->i[i+1] - c->i[i];
4201: c->ilen[i] = c->imax[i] = nn;
4202: c->nonzerorowcnt += (PetscInt)!!nn;
4203: c->rmax = PetscMax(c->rmax,nn);
4204: }
4205: MatMarkDiagonal_SeqAIJ(*C);
4206: PetscMalloc1(c->nz,&c->a);
4207: (*C)->nonzerostate++;
4208: PetscLayoutSetUp((*C)->rmap);
4209: PetscLayoutSetUp((*C)->cmap);
4210: Ccusp->nonzerostate = (*C)->nonzerostate;
4211: (*C)->preallocated = PETSC_TRUE;
4212: } else {
4213: if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
4214: c = (Mat_SeqAIJ*)(*C)->data;
4215: if (c->nz) {
4216: Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4217: if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4218: if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4219: if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4220: MatSeqAIJCUSPARSECopyToGPU(A);
4221: MatSeqAIJCUSPARSECopyToGPU(B);
4222: if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4223: if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4224: Acsr = (CsrMatrix*)Acusp->mat->mat;
4225: Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4226: Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4227: if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4228: if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4229: if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4230: if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4231: if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4232: auto pmid = Ccusp->cooPerm->begin();
4233: thrust::advance(pmid,Acsr->num_entries);
4234: PetscLogGpuTimeBegin();
4235: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4236: thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4237: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4238: thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4239: thrust::for_each(zibait,zieait,VecCUDAEquals());
4240: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4241: thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4242: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4243: thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4244: thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4245: MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);
4246: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4247: if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4248: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4249: CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4250: CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4251: CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4252: auto vT = CcsrT->values->begin();
4253: if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4254: if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4255: (*C)->transupdated = PETSC_TRUE;
4256: }
4257: cerr = WaitForCUDA();CHKERRCUDA(cerr);
4258: PetscLogGpuTimeEnd();
4259: }
4260: }
4261: PetscObjectStateIncrease((PetscObject)*C);
4262: (*C)->assembled = PETSC_TRUE;
4263: (*C)->was_assembled = PETSC_FALSE;
4264: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4265: return(0);
4266: }
4268: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4269: {
4270: PetscErrorCode ierr;
4271: bool dmem;
4272: const PetscScalar *av;
4273: cudaError_t cerr;
4276: dmem = isCudaMem(v);
4277: MatSeqAIJCUSPARSEGetArrayRead(A,&av);
4278: if (n && idx) {
4279: THRUSTINTARRAY widx(n);
4280: widx.assign(idx,idx+n);
4281: PetscLogCpuToGpu(n*sizeof(PetscInt));
4283: THRUSTARRAY *w = NULL;
4284: thrust::device_ptr<PetscScalar> dv;
4285: if (dmem) {
4286: dv = thrust::device_pointer_cast(v);
4287: } else {
4288: w = new THRUSTARRAY(n);
4289: dv = w->data();
4290: }
4291: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4293: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4294: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4295: thrust::for_each(zibit,zieit,VecCUDAEquals());
4296: if (w) {
4297: cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4298: }
4299: delete w;
4300: } else {
4301: cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4302: }
4303: if (!dmem) { PetscLogGpuToCpu(n*sizeof(PetscScalar)); } /* the copies above go device-to-host in this case */
4304: MatSeqAIJCUSPARSERestoreArrayRead(A,&av);
4305: return(0);
4306: }
4308: /*
4309: LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)
4311: requires:
4312: structurally symmetric input: could be fixed with transpose/column metadata
4313: */
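/*
   Storage sketch used below: row r of the factor stores the contiguous column window
   [max(r-bw,0), min(r+bw,n-1)] of a matrix with half-bandwidth bw, rows packed back to back in
   ba_csr with row starts in bi_csr; the triangular "ears" that would stick out of the top-left and
   bottom-right corners are clipped, which is where the corner terms in the index formulas below
   come from.
*/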
4315: /*
4316: The GPU LU factor kernel
4317: */
4318: __global__
4319: void __launch_bounds__(1024,1)
4320: mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
4321: {
4322: const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4323: const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
4324: const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
4326: // set i (row+1)
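  // bi_csr[rowb+1] is the running count of band entries in rows 0..rowb, i.e. the prefix sum of
  // the per-row window lengths min(rowb,bw) + 1 + min(n-1-rowb,bw) written in closed form:
  // n1L+n2L count the lower parts, i the diagonals, and i*bw-clip the upper parts (clip removes
  // the shortfall of the bottom rows, whose windows are cut by the right edge of the matrix).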
4327: if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
4328: // for (int rowb = start_i + blkIdx*blockDim.y + threadIdx.y; rowb < end_i; rowb += Nblk*blockDim.y) { // rows in block
4329: for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4330: if (rowb < end_i && threadIdx.x==0) {
4331: PetscInt i=rowb+1, ni = (rowb>bw) ? bw+1 : i, n1L = ni*(ni-1)/2, nug= i*bw, n2L = bw*((rowb>bw) ? (rowb-bw) : 0), mi = bw + rowb + 1 - n, clip = (mi>0) ? mi*(mi-1)/2 + mi: 0;
4332: bi_csr[rowb+1] = n1L + nug - clip + n2L + i;
4333: }
4334: }
4335: }
4336: // copy AIJ to AIJ_BAND
4337: __global__
4338: void __launch_bounds__(1024,1)
4339: mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
4340: const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
4341: const int bi_csr[], PetscScalar ba_csr[])
4342: {
4343: const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4344: const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
4345: const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
4347: // zero B
4348: if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
4349: for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4350: if (rowb < end_i) {
4351: PetscScalar *batmp = ba_csr + bi_csr[rowb];
4352: const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb];
4353: for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) {
4354: if (j<nzb) {
4355: batmp[j] = 0;
4356: }
4357: }
4358: }
4359: }
4361: // copy A into B with CSR format -- these two loops can be fused
4362: for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4363: if (rowb < end_i) {
4364: const PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
4365: const int *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0;
4366: const PetscScalar *av = aa_d + ai_d[rowa];
4367: PetscScalar *batmp = ba_csr + bi_csr[rowb];
4368: /* load in initial (unfactored row) */
4369: for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
4370: if (j<nza) {
4371: PetscInt colb = ic[ajtmp[j]], idx = colb - bjStart;
4372: PetscScalar vala = av[j];
4373: batmp[idx] = vala;
4374: }
4375: }
4376: }
4377: }
4378: }
4379: // print AIJ_BAND
4380: __global__
4381: void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
4382: {
4383: // debug
4384: if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0){
4385: printf("B (AIJ) n=%d:\n",(int)n);
4386: for (int rowb=0;rowb<n;rowb++) {
4387: const PetscInt nz = bi_csr[rowb+1] - bi_csr[rowb];
4388: const PetscScalar *batmp = ba_csr + bi_csr[rowb];
4389: for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j]));
4390: printf(" bi=%d\n",bi_csr[rowb+1]);
4391: }
4392: }
4393: }
4394: // Band LU kernel --- ba_csr bi_csr
4395: __global__
4396: void __launch_bounds__(1024,1)
4397: mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[])
4398: {
4399: extern __shared__ PetscInt smemInt[];
4400: PetscInt *sm_pkIdx = &smemInt[0];
4401: const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4402: const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
4403: const PetscInt start = field*nloc, end = start + nloc;
4404: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4405: auto g = cooperative_groups::this_grid();
4406: #endif
4407: // A22 panel update for each row A(1,:) and col A(:,1)
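  // For each pivot d (= glbDD) this is the standard right-looking update restricted to the band:
  //   L(i,d)  = A(i,d)/A(d,d)    for the (at most bw) rows i below d
  //   A(i,j) -= L(i,d)*A(d,j)    for the (at most bw) columns j to the right of d
  // assuming a symmetric structure so that every such row i has a slot for column d.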
4408: for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
4409: PetscInt tnzUd = bw, maxU = end-1 - glbDD; // we are chopping off the inter-field ears (no update past the field boundary)
4410: const PetscInt nzUd = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // global to go past ears after first
4411: const PetscInt nzUd_pad = blockDim.y*(nzUd/blockDim.y + !!(nzUd%blockDim.y));
4412: PetscScalar *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
4413: const PetscScalar *baUd = pBdd + 1; // vector of data U(i,i+1:end)
4414: const PetscScalar Bdd = *pBdd;
4415: const PetscInt offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
4416: for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd_pad ; idx += inc, myi += inc) { /* assuming symmetric structure */
4417: if (idx < nzUd && threadIdx.x==0) { /* assuming symmetric structure */
4418: const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
4419: PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
4420: *Aid = *Aid/Bdd;
4421: sm_pkIdx[threadIdx.y] = kIdx;
4422: }
4423: __syncthreads(); // synch on threadIdx.x only
4424: if (idx < nzUd) { /* assuming symmetric structure */
4425: PetscInt kIdx = sm_pkIdx[threadIdx.y];
4426: PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
4427: PetscScalar *Aij = Aid + 1;
4428: PetscScalar Lid = *Aid;
4429: for (int jIdx=threadIdx.x ; jIdx<nzUd ; jIdx += blockDim.x) {
4430: if (jIdx<nzUd) {
4431: Aij[jIdx] -= Lid*baUd[jIdx];
4432: }
4433: }
4434: }
4435: }
4436: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4437: g.sync();
4438: #else
4439: __syncthreads();
4440: #endif
4441: } /* endof for (i=0; i<n; i++) { */
4442: }
4444: static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);
4445: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
4446: {
4447: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
4448: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
4449: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
4450: Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
4451: Mat_SeqAIJCUSPARSEMultStruct *matstructA;
4452: CsrMatrix *matrixA;
4453: PetscErrorCode ierr;
4454: cudaError_t cerr;
4455: const PetscInt n=A->rmap->n, *ic, *r;
4456: const int *ai_d, *aj_d;
4457: const PetscScalar *aa_d;
4458: PetscScalar *ba_t = cusparseTriFactors->a_band_d;
4459: int *bi_t = cusparseTriFactors->i_band_d;
4460: PetscContainer container;
4461: int Ni = 10, team_size=9, Nf, nVec=56, nconcurrent = 1, nsm = -1;
4464: if (A->rmap->n == 0) {
4465: return(0);
4466: }
4467: // cusparse setup
4468: if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
4469: matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; // matstruct->cprowIndices
4470: if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
4471: matrixA = (CsrMatrix*)matstructA->mat;
4472: if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");
4474: // factor: get Nf if available
4475: PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
4476: if (container) {
4477: PetscInt *pNf=NULL;
4478: PetscContainerGetPointer(container, (void **) &pNf);
4479: Nf = (*pNf)%1000;
4480: if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use
4481: } else Nf = 1;
4482: if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
4484: // get data
4485: ic = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
4486: ai_d = thrust::raw_pointer_cast(matrixA->row_offsets->data());
4487: aj_d = thrust::raw_pointer_cast(matrixA->column_indices->data());
4488: aa_d = thrust::raw_pointer_cast(matrixA->values->data().get());
4489: r = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());
4491: cerr = WaitForCUDA();CHKERRCUDA(cerr);
4492: PetscLogGpuTimeBegin();
4493: {
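    // Recover the bandwidth from the stored nonzero count: the symbolic phase allocates
    // nz = n + (2n-1)*bw - bw^2 band entries, so solving bw^2 - (2n-1)*bw + (nz-n) = 0 for the
    // smaller root gives bw = (2n-1 - sqrt(1+4(n^2-nz)))/2, which is what the next line computes.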
4494: int bw = (2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf;
4495: int gpuid;
4496: cudaDeviceProp prop;
4497: cudaGetDevice(&gpuid);
4498: cudaGetDeviceProperties(&prop, gpuid);
4499: #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
4500: Ni = 1/nconcurrent;
4501: Ni = 1;
4502: #else
4503: nsm = prop.multiProcessorCount;
4504: Ni = nsm/Nf/nconcurrent;
4505: #endif
4506: team_size = bw/Ni + !!(bw%Ni);
4507: nVec = PetscMin(bw, 1024/team_size);
4508: PetscInfo5(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurrency = %d, num fields = %d, numSMs/GPU = %d\n",bw,Ni,nconcurrent,Nf,nsm);
4509: {
4510: dim3 dimBlockTeam(nVec,team_size);
4511: dim3 dimBlockLeague(Nf,Ni);
4512: mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
4513: CHECK_LAUNCH_ERROR(); // does a sync
4514: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4515: void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t};
4516: cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, team_size*sizeof(PetscInt), NULL);
4517: #else
4518: mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam,team_size*sizeof(PetscInt)>>>(n, bw, bi_t, ba_t);
4519: #endif
4520: CHECK_LAUNCH_ERROR(); // does a sync
4521: #if defined(PETSC_USE_LOG)
4522: PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(2*bm1 + 1)/3 + 2*(nl-bw)*bw*bw + nl*(nl+1)/2));
4523: #endif
4524: }
4525: }
4526: PetscLogGpuTimeEnd();
4528: /* determine which version of MatSolve needs to be used (adapted from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE) */
4529: B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
4530: B->ops->solvetranspose = NULL; // need transpose
4531: B->ops->matsolve = NULL;
4532: B->ops->matsolvetranspose = NULL;
4534: return(0);
4535: }
4537: static PetscErrorCode MatrixNfDestroy(void *ptr)
4538: {
4539: PetscInt *nf = (PetscInt *)ptr;
4540: PetscErrorCode ierr;
4542: PetscFree(nf);
4543: return(0);
4544: }
4546: PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4547: {
4548: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b;
4549: IS isicol;
4550: PetscErrorCode ierr;
4551: cudaError_t cerr;
4552: const PetscInt *ic,*ai=a->i,*aj=a->j;
4553: PetscScalar *ba_t;
4554: int *bi_t;
4555: PetscInt i,n=A->rmap->n,Nf;
4556: PetscInt nzBcsr,bwL,bwU;
4557: PetscBool missing;
4558: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
4559: PetscContainer container;
4562: if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
4563: MatMissingDiagonal(A,&missing,&i);
4564: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
4565: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"!cusparseTriFactors");
4566: MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);
4567: if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structurally symmetric matrices are supported");
4569: // factor: get Nf if available
4570: PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
4571: if (container) {
4572: PetscInt *pNf=NULL;
4573: PetscContainerGetPointer(container, (void **) &pNf);
4574: Nf = (*pNf)%1000;
4575: PetscContainerCreate(PETSC_COMM_SELF, &container);
4576: PetscMalloc(sizeof(PetscInt), &pNf);
4577: *pNf = Nf;
4578: PetscContainerSetPointer(container, (void *)pNf);
4579: PetscContainerSetUserDestroy(container, MatrixNfDestroy);
4580: PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);
4581: PetscContainerDestroy(&container);
4582: } else Nf = 1;
4583: if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
4585: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
4586: ISGetIndices(isicol,&ic);
4588: MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);
4589: PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);
4590: b = (Mat_SeqAIJ*)(B)->data;
4592: /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
4593: bwL = bwU = 0;
4594: for (int rwb=0; rwb<n; rwb++) {
4595: const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
4596: for (int j=0;j<anz;j++) {
4597: PetscInt colb = ic[ajtmp[j]];
4598: if (colb<rwa) { // L
4599: if (rwa-colb > bwL) bwL = rwa-colb;
4600: } else {
4601: if (colb-rwa > bwU) bwU = colb-rwa;
4602: }
4603: }
4604: }
4605: ISRestoreIndices(isicol,&ic);
4606: /* only structurally symmetric matrices are supported for now, though it might work otherwise */
4607: if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
4608: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
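  /* Entries in the band factor: n full rows of width 2*bwU+1 give n*(2*bwU+1), and each of the two
     clipped triangular corners removes bwU*(bwU+1)/2 entries, hence
     nz = n*(2*bwU+1) - bwU*(bwU+1) = n + (2*n-1)*bwU - bwU*bwU, the formula below. */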
4609: nzBcsr = n + (2*n-1)*bwU - bwU*bwU;
4610: b->maxnz = b->nz = nzBcsr;
4611: cusparseTriFactors->nnz = b->nz; // only meta data needed: n & nz
4612: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
4613: cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // include a place for flops
4614: cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
4615: cusparseTriFactors->a_band_d = ba_t;
4616: cusparseTriFactors->i_band_d = bi_t;
4617: /* In b structure: Free imax, ilen, old a, old j. Allocate solve_work, new a, new j */
4618: PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));
4619: {
4620: dim3 dimBlockTeam(1,128);
4621: dim3 dimBlockLeague(Nf,1);
4622: mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
4623: }
4624: CHECK_LAUNCH_ERROR(); // does a sync
4626: // setup data
4627: if (!cusparseTriFactors->rpermIndices) {
4628: const PetscInt *r;
4630: ISGetIndices(isrow,&r);
4631: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
4632: cusparseTriFactors->rpermIndices->assign(r, r+n);
4633: ISRestoreIndices(isrow,&r);
4634: PetscLogCpuToGpu(n*sizeof(PetscInt));
4635: }
4636: /* upper triangular indices */
4637: if (!cusparseTriFactors->cpermIndices) {
4638: const PetscInt *c;
4640: ISGetIndices(isicol,&c);
4641: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
4642: cusparseTriFactors->cpermIndices->assign(c, c+n);
4643: ISRestoreIndices(isicol,&c);
4644: PetscLogCpuToGpu(n*sizeof(PetscInt));
4645: }
4647: /* put together the new matrix */
4648: b->free_a = PETSC_FALSE;
4649: b->free_ij = PETSC_FALSE;
4650: b->singlemalloc = PETSC_FALSE;
4651: b->ilen = NULL;
4652: b->imax = NULL;
4653: b->row = isrow;
4654: b->col = iscol;
4655: PetscObjectReference((PetscObject)isrow);
4656: PetscObjectReference((PetscObject)iscol);
4657: b->icol = isicol;
4658: PetscMalloc1(n+1,&b->solve_work);
4660: B->factortype = MAT_FACTOR_LU;
4661: B->info.factor_mallocs = 0;
4662: B->info.fill_ratio_given = 0;
4664: if (ai[n]) {
4665: B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
4666: } else {
4667: B->info.fill_ratio_needed = 0.0;
4668: }
4669: #if defined(PETSC_USE_INFO)
4670: if (ai[n] != 0) {
4671: PetscReal af = B->info.fill_ratio_needed;
4672: PetscInfo1(A,"Band fill ratio %g\n",(double)af);
4673: } else {
4674: PetscInfo(A,"Empty matrix\n");
4675: }
4676: #endif
4677: if (a->inode.size) {
4678: PetscInfo(A,"Warning: using inodes in band solver.\n");
4679: }
4680: MatSeqAIJCheckInode_FactorLU(B);
4681: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
4682: B->offloadmask = PETSC_OFFLOAD_GPU;
4684: return(0);
4685: }
4687: /* Use -pc_factor_mat_solver_type cusparseband */
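/* A hypothetical command line combining the options mentioned in the comments above:
     -mat_type aijcusparse -pc_type lu -pc_factor_mat_solver_type cusparseband
   optionally with -mat_no_inode -pc_factor_mat_ordering_type rcm for the blocked (Nf>1) case
   described before the band kernels. */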
4688: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
4689: {
4691: *type = MATSOLVERCUSPARSEBAND;
4692: return(0);
4693: }
4695: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
4696: {
4698: PetscInt n = A->rmap->n;
4701: MatCreate(PetscObjectComm((PetscObject)A),B);
4702: MatSetSizes(*B,n,n,n,n);
4703: (*B)->factortype = ftype;
4704: (*B)->useordering = PETSC_TRUE;
4705: MatSetType(*B,MATSEQAIJCUSPARSE);
4707: if (ftype == MAT_FACTOR_LU) {
4708: MatSetBlockSizesFromMats(*B,A,A);
4709: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
4710: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
4711: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");
4713: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
4714: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);
4715: return(0);
4716: }
4718: #define WARP_SIZE 32
4719: template <typename T>
4720: __forceinline__ __device__
4721: T wreduce(T a)
4722: {
4723: T b;
4724: #pragma unroll
4725: for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) {
4726: b = __shfl_down_sync(0xffffffff, a, i);
4727: a += b;
4728: }
4729: return a;
4730: }
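// wreduce is a standard warp-level shuffle reduction: each step adds the value held i lanes
// higher, so after log2(WARP_SIZE) steps lane 0 holds the sum of all 32 lanes (the other lanes
// hold partial sums); breduce below gathers one such per-warp result through shared memory.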
4731: // reduce in a block, returns result in thread 0
4732: template <typename T, int BLOCK_SIZE>
4733: __device__
4734: T breduce(T a)
4735: {
4736: constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
4737: __shared__ double buf[NWARP];
4738: int wid = threadIdx.x / WARP_SIZE;
4739: int laneid = threadIdx.x % WARP_SIZE;
4740: T b = wreduce<T>(a);
4741: if (laneid == 0)
4742: buf[wid] = b;
4743: __syncthreads();
4744: if (wid == 0) {
4745: if (threadIdx.x < NWARP)
4746: a = buf[threadIdx.x];
4747: else
4748: a = 0;
4749: for (int i = (NWARP+1)/2; i >= 1; i = i >> 1) {
4750: a += __shfl_down_sync(0xffffffff, a, i);
4751: }
4752: }
4753: return a;
4754: }
4757: // Band LU solve (forward/back substitution) kernel --- ba_csr
4758: template <int BLOCK_SIZE>
4759: __global__
4760: void __launch_bounds__(256,1)
4761: mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
4762: {
4763: const PetscInt Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc, chopnz = bw*(bw+1)/2, blocknz=(2*bw+1)*nloc, blocknz_0 = blocknz-chopnz;
4764: const PetscScalar *pLi;
4765: const int tid = threadIdx.x;
4767: /* Next, solve L */
4768: pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field
4769: for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
4770: const PetscInt col = locDD<bw ? start : (glbDD-bw);
4771: PetscScalar t = 0;
4772: for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
4773: t += pLi[idx]*x[j];
4774: }
4775: #if defined(PETSC_USE_COMPLEX)
4776: PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
4777: PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
4778: t = tt;
4779: #else
4780: t = breduce<PetscReal,BLOCK_SIZE>(t);
4781: #endif
4782: if (threadIdx.x == 0)
4783: x[glbDD] -= t; // /1.0
4784: __syncthreads();
4785: // inc
4786: pLi += glbDD-col; // get to diagonal
4787: if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U, only last block has funny offset
4788: else pLi += bw;
4789: pLi += 1; // skip to next row
4790: if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip padding at beginning (ear)
4791: }
4792: /* Then, solve U */
4793: pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of real data on block (diagonal)
4794: if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row
4795: for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
4796: const PetscInt col = (locDD<bw) ? end-1 : glbDD+bw; // end of row in U
4797: PetscScalar t = 0;
4798: for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
4799: t += pLi[-idx]*x[j];
4800: }
4801: #if defined(PETSC_USE_COMPLEX)
4802: PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
4803: PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
4804: t = tt;
4805: #else
4806: t = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(t));
4807: #endif
4808: pLi -= col-glbDD; // diagonal
4809: if (threadIdx.x == 0) {
4810: x[glbDD] -= t;
4811: x[glbDD] /= pLi[0];
4812: }
4813: __syncthreads();
4814: // inc past L to start of previous U
4815: pLi -= bw+1;
4816: if (glbDD<bw) pLi += bw-glbDD; // overshot in top left corner
4817: if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past right corner
4818: }
4819: }
4821: static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
4822: {
4823: const PetscScalar *barray;
4824: PetscScalar *xarray;
4825: thrust::device_ptr<const PetscScalar> bGPU;
4826: thrust::device_ptr<PetscScalar> xGPU;
4827: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4828: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
4829: PetscInt n=A->rmap->n, nz=cusparseTriFactors->nnz, bw=(2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-nz))+PETSC_MACHINE_EPSILON))/2, Nf;
4830: PetscErrorCode ierr;
4831: cudaError_t cerr;
4832: PetscContainer container;
4835: if (A->rmap->n == 0) {
4836: return(0);
4837: }
4838: // factor: get Nf if available
4839: PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
4840: if (container) {
4841: PetscInt *pNf=NULL;
4842: PetscContainerGetPointer(container, (void **) &pNf);
4843: Nf = (*pNf)%1000;
4844: } else Nf = 1;
4845: if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
4847: /* Get the GPU pointers */
4848: VecCUDAGetArrayWrite(xx,&xarray);
4849: VecCUDAGetArrayRead(bb,&barray);
4850: xGPU = thrust::device_pointer_cast(xarray);
4851: bGPU = thrust::device_pointer_cast(barray);
4853: PetscLogGpuTimeBegin();
4854: /* First, reorder with the row permutation */
4855: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
4856: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
4857: tempGPU->begin());
4858: constexpr int block = 128;
4859: mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
4860: CHECK_LAUNCH_ERROR(); // does a sync
4862: /* Last, reorder with the column permutation */
4863: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
4864: thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
4865: xGPU);
4867: VecCUDARestoreArrayRead(bb,&barray);
4868: VecCUDARestoreArrayWrite(xx,&xarray);
4869: cerr = WaitForCUDA();CHKERRCUDA(cerr);
4870: PetscLogGpuTimeEnd();
4871: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
4872: return(0);
4873: }