Actual source code: aijcusparse.cu
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library,
4: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8: #include <petscconf.h>
9: #include <../src/mat/impls/aij/seq/aij.h>
10: #include <../src/mat/impls/sbaij/seq/sbaij.h>
11: #include <../src/vec/vec/impls/dvecimpl.h>
12: #include <petsc/private/vecimpl.h>
13: #undef VecType
14: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15: #include <thrust/adjacent_difference.h>
16: #include <thrust/async/for_each.h>
17: #include <thrust/iterator/constant_iterator.h>
18: #include <thrust/remove.h>
19: #include <thrust/sort.h>
20: #include <thrust/unique.h>
22: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
23: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
24: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we copy them in
25: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command-line options for them.
27: typedef enum {
28: CUSPARSE_MV_ALG_DEFAULT = 0,
29: CUSPARSE_COOMV_ALG = 1,
30: CUSPARSE_CSRMV_ALG1 = 2,
31: CUSPARSE_CSRMV_ALG2 = 3
32: } cusparseSpMVAlg_t;
34: typedef enum {
35: CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36: CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
37: CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
38: CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
39: CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
40: CUSPARSE_SPMM_ALG_DEFAULT = 0,
41: CUSPARSE_SPMM_COO_ALG1 = 1,
42: CUSPARSE_SPMM_COO_ALG2 = 2,
43: CUSPARSE_SPMM_COO_ALG3 = 3,
44: CUSPARSE_SPMM_COO_ALG4 = 5,
45: CUSPARSE_SPMM_CSR_ALG1 = 4,
46: CUSPARSE_SPMM_CSR_ALG2 = 6,
47: } cusparseSpMMAlg_t;
49: typedef enum {
50: CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
51: CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic
52: } cusparseCsr2CscAlg_t;
53: */
54: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
55: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
56: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
57: #endif
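/*
   A minimal illustration (sketch, not library code) of the 0-based ordering described in the comment above:
   because the strings in MatCUSPARSESpMVAlgorithms[] are listed in the same order as the cusparse enum
   values, the PetscEnum produced by PetscOptionsEnum() can be cast directly to the cuSPARSE enum, e.g.

     PetscEnum         choice = (PetscEnum)2;               // position of "CSRMV_ALG1" in MatCUSPARSESpMVAlgorithms[]
     cusparseSpMVAlg_t alg    = (cusparseSpMVAlg_t)choice;  // equals CUSPARSE_CSRMV_ALG1 == 2

   The same convention holds for MatCUSPARSESpMMAlgorithms[] and MatCUSPARSECsr2CscAlgorithms[].
*/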
59: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
63: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
67: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
82: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
88: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
91: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
93: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
95: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
96: {
97: *type = MATSOLVERCUSPARSE;
98: return 0;
99: }
101: /*MC
102: MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
103: on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. The currently supported
104: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
105: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
106: CUSPARSE triangular solve algorithm; however, the performance can be quite poor, so these
107: algorithms are not recommended. This class does NOT support direct solver operations.
109: Level: beginner
111: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
112: M*/
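/*
   A minimal usage sketch (assuming an existing KSP ksp whose matrix is of type MATSEQAIJCUSPARSE):
   selecting this solver for an ILU preconditioner from application code

     PC pc;
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCILU);
     PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);

   or, equivalently, from the command line: -pc_type ilu -pc_factor_mat_solver_type cusparse
*/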
114: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
115: {
116: PetscInt n = A->rmap->n;
118: MatCreate(PetscObjectComm((PetscObject)A),B);
119: MatSetSizes(*B,n,n,n,n);
120: (*B)->factortype = ftype;
121: MatSetType(*B,MATSEQAIJCUSPARSE);
123: if (A->boundtocpu && A->bindingpropagates) MatBindToCPU(*B,PETSC_TRUE);
124: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
125: MatSetBlockSizesFromMats(*B,A,A);
126: if (!A->boundtocpu) {
127: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
128: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
129: } else {
130: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
131: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
132: }
133: PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
134: PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);
135: PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);
136: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
137: if (!A->boundtocpu) {
138: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
139: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
140: } else {
141: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
142: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
143: }
144: PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);
145: PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);
146: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
148: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
149: (*B)->canuseordering = PETSC_TRUE;
150: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
151: return 0;
152: }
154: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
155: {
156: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
158: switch (op) {
159: case MAT_CUSPARSE_MULT:
160: cusparsestruct->format = format;
161: break;
162: case MAT_CUSPARSE_ALL:
163: cusparsestruct->format = format;
164: break;
165: default:
166: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
167: }
168: return 0;
169: }
171: /*@
172: MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
173: operation. Only the MatMult operation can use different GPU storage formats
174: for MPIAIJCUSPARSE matrices.
175: Not Collective
177: Input Parameters:
178: + A - Matrix of type SEQAIJCUSPARSE
179: . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
180: - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, or MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)
184: Level: intermediate
186: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
187: @*/
188: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
189: {
191: PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
192: return 0;
193: }
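/*
   A minimal usage sketch (assuming A is an existing matrix of type MATSEQAIJCUSPARSE): requesting the ELL
   storage format for matrix-vector products only

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);

   the same effect is available at runtime with -mat_cusparse_mult_storage_format ell
*/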
195: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
196: {
197: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
199: cusparsestruct->use_cpu_solve = use_cpu;
200: return 0;
201: }
203: /*@
204: MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU MatSolve().
206: Input Parameters:
207: + A - Matrix of type SEQAIJCUSPARSE
208: - use_cpu - set flag for using the built-in CPU MatSolve
212: Notes:
213: The cuSPARSE LU solver currently computes the factors with the built-in CPU method
214: and moves the factors to the GPU for the solve. We have observed better performance by keeping the data on the CPU and computing the solve there.
215: Use this method to specify whether the solve is done on the CPU or the GPU (the GPU is the default).
217: Level: intermediate
219: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
220: @*/
221: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
222: {
224: PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
225: return 0;
226: }
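/*
   A minimal usage sketch (assuming A is a MATSEQAIJCUSPARSE matrix that will be factored): keeping the
   (I)LU triangular solves on the CPU while the other matrix operations stay on the GPU

     MatCUSPARSESetUseCPUSolve(A,PETSC_TRUE);

   or use the runtime option -mat_cusparse_use_cpu_solve
*/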
228: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
229: {
230: switch (op) {
231: case MAT_FORM_EXPLICIT_TRANSPOSE:
232: /* destroy the transpose matrix if present, to prevent logic errors if flg is later set back to true */
233: if (A->form_explicit_transpose && !flg) MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
234: A->form_explicit_transpose = flg;
235: break;
236: default:
237: MatSetOption_SeqAIJ(A,op,flg);
238: break;
239: }
240: return 0;
241: }
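/*
   A minimal usage sketch (assuming A is a MATSEQAIJCUSPARSE matrix): requesting that an explicit transpose
   be formed and kept up to date so that transpose operations such as MatMultTranspose() can use it

     MatSetOption(A,MAT_FORM_EXPLICIT_TRANSPOSE,PETSC_TRUE);
*/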
243: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
245: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
246: {
247: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
248: IS isrow = b->row,iscol = b->col;
249: PetscBool row_identity,col_identity;
250: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
252: MatSeqAIJCUSPARSECopyFromGPU(A);
253: MatLUFactorNumeric_SeqAIJ(B,A,info);
254: B->offloadmask = PETSC_OFFLOAD_CPU;
255: /* determine which version of MatSolve needs to be used. */
256: ISIdentity(isrow,&row_identity);
257: ISIdentity(iscol,&col_identity);
258: if (row_identity && col_identity) {
259: if (!cusparsestruct->use_cpu_solve) {
260: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
261: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
262: }
263: B->ops->matsolve = NULL;
264: B->ops->matsolvetranspose = NULL;
265: } else {
266: if (!cusparsestruct->use_cpu_solve) {
267: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
268: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
269: }
270: B->ops->matsolve = NULL;
271: B->ops->matsolvetranspose = NULL;
272: }
274: /* get the triangular factors */
275: if (!cusparsestruct->use_cpu_solve) {
276: MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
277: }
278: return 0;
279: }
281: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
282: {
284: MatCUSPARSEStorageFormat format;
285: PetscBool flg;
286: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
288: PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
289: if (A->factortype == MAT_FACTOR_NONE) {
290: PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
291: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
292: if (flg) MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
294: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
295: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
296: if (flg) MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
297: PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);
298: if (flg) MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);
299: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
300: PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
301: "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
302: /* If the user set this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
303: #if CUSPARSE_VERSION > 11301
305: #else
307: #endif
308: PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
309: "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
312: PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
313: "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
315: #endif
316: }
317: PetscOptionsTail();
318: return 0;
319: }
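/*
   A sketch of typical runtime options handled by the routine above, shown for a hypothetical executable
   ./app that uses a MATSEQAIJCUSPARSE matrix (the spmv_alg option requires a CUDA 11 or later build,
   per the guard above):

     ./app -mat_type seqaijcusparse -mat_cusparse_storage_format csr \
           -mat_cusparse_spmv_alg csrmv_alg1 -mat_cusparse_use_cpu_solve

   the enum option values are matched against the entries of MatCUSPARSEStorageFormats[] and
   MatCUSPARSESpMVAlgorithms[] defined near the top of this file
*/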
321: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
322: {
323: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
325: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
326: MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
327: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
328: return 0;
329: }
331: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
332: {
333: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
335: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
336: MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
337: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
338: return 0;
339: }
341: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
342: {
343: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
345: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
346: MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
347: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
348: return 0;
349: }
351: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
352: {
353: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
355: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
356: MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
357: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
358: return 0;
359: }
361: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
362: {
363: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
364: PetscInt n = A->rmap->n;
365: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
366: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
367: const PetscInt *ai = a->i,*aj = a->j,*vi;
368: const MatScalar *aa = a->a,*v;
369: PetscInt *AiLo, *AjLo;
370: PetscInt i,nz, nzLower, offset, rowOffset;
372: if (!n) return 0;
373: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
374: try {
375: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
376: nzLower=n+ai[n]-ai[1];
377: if (!loTriFactor) {
378: PetscScalar *AALo;
380: cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));
382: /* Allocate Space for the lower triangular matrix */
383: cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));
384: cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));
386: /* Fill the lower triangular matrix */
387: AiLo[0] = (PetscInt) 0;
388: AiLo[n] = nzLower;
389: AjLo[0] = (PetscInt) 0;
390: AALo[0] = (MatScalar) 1.0;
391: v = aa;
392: vi = aj;
393: offset = 1;
394: rowOffset= 1;
395: for (i=1; i<n; i++) {
396: nz = ai[i+1] - ai[i];
397: /* additional 1 for the term on the diagonal */
398: AiLo[i] = rowOffset;
399: rowOffset += nz+1;
401: PetscArraycpy(&(AjLo[offset]), vi, nz);
402: PetscArraycpy(&(AALo[offset]), v, nz);
404: offset += nz;
405: AjLo[offset] = (PetscInt) i;
406: AALo[offset] = (MatScalar) 1.0;
407: offset += 1;
409: v += nz;
410: vi += nz;
411: }
413: /* allocate space for the triangular factor information */
414: PetscNew(&loTriFactor);
415: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
416: /* Create the matrix description */
417: cusparseCreateMatDescr(&loTriFactor->descr);
418: cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
419: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
420: cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
421: #else
422: cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
423: #endif
424: cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);
425: cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);
427: /* set the operation */
428: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
430: /* set the matrix */
431: loTriFactor->csrMat = new CsrMatrix;
432: loTriFactor->csrMat->num_rows = n;
433: loTriFactor->csrMat->num_cols = n;
434: loTriFactor->csrMat->num_entries = nzLower;
436: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
437: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
439: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
440: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
442: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
443: loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
445: /* Create the solve analysis information */
446: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
447: cusparse_create_analysis_info(&loTriFactor->solveInfo);
448: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
449: PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
450: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
451: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
452: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
453: &loTriFactor->solveBufferSize));
454: cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);
455: #endif
457: /* perform the solve analysis */
458: PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
459: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
460: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
461: loTriFactor->csrMat->column_indices->data().get(),
462: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
463: loTriFactor->solveInfo,
464: loTriFactor->solvePolicy, loTriFactor->solveBuffer));
465: #else
466: loTriFactor->solveInfo));
467: #endif
468: WaitForCUDA();
469: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
471: /* assign the pointer */
472: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
473: loTriFactor->AA_h = AALo;
474: cudaFreeHost(AiLo);
475: cudaFreeHost(AjLo);
476: PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
477: } else { /* update values only */
478: if (!loTriFactor->AA_h) {
479: cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));
480: }
481: /* Fill the lower triangular matrix */
482: loTriFactor->AA_h[0] = 1.0;
483: v = aa;
484: vi = aj;
485: offset = 1;
486: for (i=1; i<n; i++) {
487: nz = ai[i+1] - ai[i];
488: PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);
489: offset += nz;
490: loTriFactor->AA_h[offset] = 1.0;
491: offset += 1;
492: v += nz;
493: }
494: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
495: PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));
496: }
497: } catch(char *ex) {
498: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
499: }
500: }
501: return 0;
502: }
504: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
505: {
506: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
507: PetscInt n = A->rmap->n;
508: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
509: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
510: const PetscInt *aj = a->j,*adiag = a->diag,*vi;
511: const MatScalar *aa = a->a,*v;
512: PetscInt *AiUp, *AjUp;
513: PetscInt i,nz, nzUpper, offset;
515: if (!n) return 0;
516: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
517: try {
518: /* next, figure out the number of nonzeros in the upper triangular matrix. */
519: nzUpper = adiag[0]-adiag[n];
520: if (!upTriFactor) {
521: PetscScalar *AAUp;
523: cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));
525: /* Allocate Space for the upper triangular matrix */
526: cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));
527: cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));
529: /* Fill the upper triangular matrix */
530: AiUp[0]=(PetscInt) 0;
531: AiUp[n]=nzUpper;
532: offset = nzUpper;
533: for (i=n-1; i>=0; i--) {
534: v = aa + adiag[i+1] + 1;
535: vi = aj + adiag[i+1] + 1;
537: /* number of elements NOT on the diagonal */
538: nz = adiag[i] - adiag[i+1]-1;
540: /* decrement the offset */
541: offset -= (nz+1);
543: /* first, set the diagonal elements */
544: AjUp[offset] = (PetscInt) i;
545: AAUp[offset] = (MatScalar)1./v[nz];
546: AiUp[i] = AiUp[i+1] - (nz+1);
548: PetscArraycpy(&(AjUp[offset+1]), vi, nz);
549: PetscArraycpy(&(AAUp[offset+1]), v, nz);
550: }
552: /* allocate space for the triangular factor information */
553: PetscNew(&upTriFactor);
554: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
556: /* Create the matrix description */
557: cusparseCreateMatDescr(&upTriFactor->descr);
558: cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
559: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
560: cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
561: #else
562: cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
563: #endif
564: cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);
565: cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);
567: /* set the operation */
568: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
570: /* set the matrix */
571: upTriFactor->csrMat = new CsrMatrix;
572: upTriFactor->csrMat->num_rows = n;
573: upTriFactor->csrMat->num_cols = n;
574: upTriFactor->csrMat->num_entries = nzUpper;
576: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
577: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
579: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
580: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
582: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
583: upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
585: /* Create the solve analysis information */
586: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
587: cusparse_create_analysis_info(&upTriFactor->solveInfo);
588: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
589: PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
590: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
591: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
592: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
593: &upTriFactor->solveBufferSize));
594: cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);
595: #endif
597: /* perform the solve analysis */
598: PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
599: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
600: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
601: upTriFactor->csrMat->column_indices->data().get(),
602: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
603: upTriFactor->solveInfo,
604: upTriFactor->solvePolicy, upTriFactor->solveBuffer));
605: #else
606: upTriFactor->solveInfo));
607: #endif
608: WaitForCUDA();
609: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
611: /* assign the pointer */
612: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
613: upTriFactor->AA_h = AAUp;
614: cudaFreeHost(AiUp);
615: cudaFreeHost(AjUp);
616: PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
617: } else {
618: if (!upTriFactor->AA_h) {
619: cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));
620: }
621: /* Fill the upper triangular matrix */
622: offset = nzUpper;
623: for (i=n-1; i>=0; i--) {
624: v = aa + adiag[i+1] + 1;
626: /* number of elements NOT on the diagonal */
627: nz = adiag[i] - adiag[i+1]-1;
629: /* decrement the offset */
630: offset -= (nz+1);
632: /* first, set the diagonal elements */
633: upTriFactor->AA_h[offset] = 1./v[nz];
634: PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);
635: }
636: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
637: PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));
638: }
639: } catch(char *ex) {
640: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
641: }
642: }
643: return 0;
644: }
646: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
647: {
648: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
649: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
650: IS isrow = a->row,iscol = a->icol;
651: PetscBool row_identity,col_identity;
652: PetscInt n = A->rmap->n;
655: MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
656: MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);
658: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
659: cusparseTriFactors->nnz=a->nz;
661: A->offloadmask = PETSC_OFFLOAD_BOTH;
662: /* lower triangular indices */
663: ISIdentity(isrow,&row_identity);
664: if (!row_identity && !cusparseTriFactors->rpermIndices) {
665: const PetscInt *r;
667: ISGetIndices(isrow,&r);
668: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
669: cusparseTriFactors->rpermIndices->assign(r, r+n);
670: ISRestoreIndices(isrow,&r);
671: PetscLogCpuToGpu(n*sizeof(PetscInt));
672: }
674: /* upper triangular indices */
675: ISIdentity(iscol,&col_identity);
676: if (!col_identity && !cusparseTriFactors->cpermIndices) {
677: const PetscInt *c;
679: ISGetIndices(iscol,&c);
680: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
681: cusparseTriFactors->cpermIndices->assign(c, c+n);
682: ISRestoreIndices(iscol,&c);
683: PetscLogCpuToGpu(n*sizeof(PetscInt));
684: }
685: return 0;
686: }
688: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
689: {
690: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
691: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
692: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
693: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
694: PetscInt *AiUp, *AjUp;
695: PetscScalar *AAUp;
696: PetscScalar *AALo;
697: PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
698: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
699: const PetscInt *ai = b->i,*aj = b->j,*vj;
700: const MatScalar *aa = b->a,*v;
702: if (!n) return 0;
703: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
704: try {
705: cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));
706: cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));
707: if (!upTriFactor && !loTriFactor) {
708: /* Allocate Space for the upper triangular matrix */
709: cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));
710: cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));
712: /* Fill the upper triangular matrix */
713: AiUp[0]=(PetscInt) 0;
714: AiUp[n]=nzUpper;
715: offset = 0;
716: for (i=0; i<n; i++) {
717: /* set the pointers */
718: v = aa + ai[i];
719: vj = aj + ai[i];
720: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
722: /* first, set the diagonal elements */
723: AjUp[offset] = (PetscInt) i;
724: AAUp[offset] = (MatScalar)1.0/v[nz];
725: AiUp[i] = offset;
726: AALo[offset] = (MatScalar)1.0/v[nz];
728: offset+=1;
729: if (nz>0) {
730: PetscArraycpy(&(AjUp[offset]), vj, nz);
731: PetscArraycpy(&(AAUp[offset]), v, nz);
732: for (j=offset; j<offset+nz; j++) {
733: AAUp[j] = -AAUp[j];
734: AALo[j] = AAUp[j]/v[nz];
735: }
736: offset+=nz;
737: }
738: }
740: /* allocate space for the triangular factor information */
741: PetscNew(&upTriFactor);
742: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
744: /* Create the matrix description */
745: cusparseCreateMatDescr(&upTriFactor->descr);
746: cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
747: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
748: cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
749: #else
750: cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
751: #endif
752: cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);
753: cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);
755: /* set the matrix */
756: upTriFactor->csrMat = new CsrMatrix;
757: upTriFactor->csrMat->num_rows = A->rmap->n;
758: upTriFactor->csrMat->num_cols = A->cmap->n;
759: upTriFactor->csrMat->num_entries = a->nz;
761: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
762: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
764: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
765: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
767: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
768: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
770: /* set the operation */
771: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
773: /* Create the solve analysis information */
774: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
775: cusparse_create_analysis_info(&upTriFactor->solveInfo);
776: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
777: PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
778: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
779: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
780: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
781: &upTriFactor->solveBufferSize));
782: cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);
783: #endif
785: /* perform the solve analysis */
786: PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
787: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
788: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
789: upTriFactor->csrMat->column_indices->data().get(),
790: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
791: upTriFactor->solveInfo,
792: upTriFactor->solvePolicy, upTriFactor->solveBuffer));
793: #else
794: upTriFactor->solveInfo));
795: #endif
796: WaitForCUDA();
797: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
799: /* assign the pointer */
800: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
802: /* allocate space for the triangular factor information */
803: PetscNew(&loTriFactor);
804: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
806: /* Create the matrix description */
807: cusparseCreateMatDescr(&loTriFactor->descr);
808: cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
809: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
810: cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
811: #else
812: cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
813: #endif
814: cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);
815: cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);
817: /* set the operation */
818: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
820: /* set the matrix */
821: loTriFactor->csrMat = new CsrMatrix;
822: loTriFactor->csrMat->num_rows = A->rmap->n;
823: loTriFactor->csrMat->num_cols = A->cmap->n;
824: loTriFactor->csrMat->num_entries = a->nz;
826: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
827: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
829: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
830: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
832: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
833: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
835: /* Create the solve analysis information */
836: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
837: cusparse_create_analysis_info(&loTriFactor->solveInfo);
838: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
839: PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
840: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
841: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
842: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
843: &loTriFactor->solveBufferSize));
844: cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);
845: #endif
847: /* perform the solve analysis */
848: PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
849: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
850: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
851: loTriFactor->csrMat->column_indices->data().get(),
852: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
853: loTriFactor->solveInfo,
854: loTriFactor->solvePolicy, loTriFactor->solveBuffer));
855: #else
856: loTriFactor->solveInfo));
857: #endif
858: WaitForCUDA();
859: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
861: /* assign the pointer */
862: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
864: PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));
865: cudaFreeHost(AiUp);
866: cudaFreeHost(AjUp);
867: } else {
868: /* Fill the upper triangular matrix */
869: offset = 0;
870: for (i=0; i<n; i++) {
871: /* set the pointers */
872: v = aa + ai[i];
873: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
875: /* first, set the diagonal elements */
876: AAUp[offset] = 1.0/v[nz];
877: AALo[offset] = 1.0/v[nz];
879: offset+=1;
880: if (nz>0) {
881: PetscArraycpy(&(AAUp[offset]), v, nz);
882: for (j=offset; j<offset+nz; j++) {
883: AAUp[j] = -AAUp[j];
884: AALo[j] = AAUp[j]/v[nz];
885: }
886: offset+=nz;
887: }
888: }
891: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
892: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
893: PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));
894: }
895: cudaFreeHost(AAUp);
896: cudaFreeHost(AALo);
897: } catch(char *ex) {
898: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
899: }
900: }
901: return 0;
902: }
904: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
905: {
906: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
907: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
908: IS ip = a->row;
909: PetscBool perm_identity;
910: PetscInt n = A->rmap->n;
913: MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
914: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
915: cusparseTriFactors->nnz=(a->nz-n)*2 + n;
917: A->offloadmask = PETSC_OFFLOAD_BOTH;
919: /* lower triangular indices */
920: ISIdentity(ip,&perm_identity);
921: if (!perm_identity) {
922: IS iip;
923: const PetscInt *irip,*rip;
925: ISInvertPermutation(ip,PETSC_DECIDE,&iip);
926: ISGetIndices(iip,&irip);
927: ISGetIndices(ip,&rip);
928: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
929: cusparseTriFactors->rpermIndices->assign(rip, rip+n);
930: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
931: cusparseTriFactors->cpermIndices->assign(irip, irip+n);
932: ISRestoreIndices(iip,&irip);
933: ISDestroy(&iip);
934: ISRestoreIndices(ip,&rip);
935: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
936: }
937: return 0;
938: }
940: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
941: {
942: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
943: IS ip = b->row;
944: PetscBool perm_identity;
946: MatSeqAIJCUSPARSECopyFromGPU(A);
947: MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
948: B->offloadmask = PETSC_OFFLOAD_CPU;
949: /* determine which version of MatSolve needs to be used. */
950: ISIdentity(ip,&perm_identity);
951: if (perm_identity) {
952: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
953: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
954: B->ops->matsolve = NULL;
955: B->ops->matsolvetranspose = NULL;
956: } else {
957: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
958: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
959: B->ops->matsolve = NULL;
960: B->ops->matsolvetranspose = NULL;
961: }
963: /* get the triangular factors */
964: MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
965: return 0;
966: }
968: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
969: {
970: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
971: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
972: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
973: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
974: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
975: cusparseIndexBase_t indexBase;
976: cusparseMatrixType_t matrixType;
977: cusparseFillMode_t fillMode;
978: cusparseDiagType_t diagType;
980: /* allocate space for the transpose of the lower triangular factor */
981: PetscNew(&loTriFactorT);
982: loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
984: /* set the matrix descriptors of the lower triangular factor */
985: matrixType = cusparseGetMatType(loTriFactor->descr);
986: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
987: fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
988: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
989: diagType = cusparseGetMatDiagType(loTriFactor->descr);
991: /* Create the matrix description */
992: cusparseCreateMatDescr(&loTriFactorT->descr);
993: cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);
994: cusparseSetMatType(loTriFactorT->descr, matrixType);
995: cusparseSetMatFillMode(loTriFactorT->descr, fillMode);
996: cusparseSetMatDiagType(loTriFactorT->descr, diagType);
998: /* set the operation */
999: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1001: /* allocate GPU space for the CSC of the lower triangular factor*/
1002: loTriFactorT->csrMat = new CsrMatrix;
1003: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
1004: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
1005: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
1006: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1007: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1008: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1010: /* compute the transpose of the lower triangular factor, i.e. the CSC */
1011: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1012: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1013: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1014: loTriFactor->csrMat->values->data().get(),
1015: loTriFactor->csrMat->row_offsets->data().get(),
1016: loTriFactor->csrMat->column_indices->data().get(),
1017: loTriFactorT->csrMat->values->data().get(),
1018: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1019: CUSPARSE_ACTION_NUMERIC,indexBase,
1020: CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1021: cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);
1022: #endif
1024: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1025: PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1026: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1027: loTriFactor->csrMat->values->data().get(),
1028: loTriFactor->csrMat->row_offsets->data().get(),
1029: loTriFactor->csrMat->column_indices->data().get(),
1030: loTriFactorT->csrMat->values->data().get(),
1031: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1032: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1033: CUSPARSE_ACTION_NUMERIC, indexBase,
1034: CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
1035: #else
1036: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1037: CUSPARSE_ACTION_NUMERIC, indexBase));
1038: #endif
1039: WaitForCUDA();
1040: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1042: /* Create the solve analysis information */
1043: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1044: cusparse_create_analysis_info(&loTriFactorT->solveInfo);
1045: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1046: PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1047: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1048: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1049: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1050: &loTriFactorT->solveBufferSize));
1051: cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);
1052: #endif
1054: /* perform the solve analysis */
1055: PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1056: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1057: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1058: loTriFactorT->csrMat->column_indices->data().get(),
1059: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1060: loTriFactorT->solveInfo,
1061: loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1062: #else
1063: loTriFactorT->solveInfo));
1064: #endif
1065: WaitForCUDA();
1066: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1068: /* assign the pointer */
1069: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1071: /*********************************************/
1072: /* Now the Transpose of the Upper Tri Factor */
1073: /*********************************************/
1075: /* allocate space for the transpose of the upper triangular factor */
1076: PetscNew(&upTriFactorT);
1077: upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1079: /* set the matrix descriptors of the upper triangular factor */
1080: matrixType = cusparseGetMatType(upTriFactor->descr);
1081: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1082: fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1083: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1084: diagType = cusparseGetMatDiagType(upTriFactor->descr);
1086: /* Create the matrix description */
1087: cusparseCreateMatDescr(&upTriFactorT->descr);
1088: cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);
1089: cusparseSetMatType(upTriFactorT->descr, matrixType);
1090: cusparseSetMatFillMode(upTriFactorT->descr, fillMode);
1091: cusparseSetMatDiagType(upTriFactorT->descr, diagType);
1093: /* set the operation */
1094: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1096: /* allocate GPU space for the CSC of the upper triangular factor*/
1097: upTriFactorT->csrMat = new CsrMatrix;
1098: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
1099: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
1100: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
1101: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1102: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1103: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1105: /* compute the transpose of the upper triangular factor, i.e. the CSC */
1106: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1107: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1108: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1109: upTriFactor->csrMat->values->data().get(),
1110: upTriFactor->csrMat->row_offsets->data().get(),
1111: upTriFactor->csrMat->column_indices->data().get(),
1112: upTriFactorT->csrMat->values->data().get(),
1113: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1114: CUSPARSE_ACTION_NUMERIC,indexBase,
1115: CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1116: cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);
1117: #endif
1119: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1120: PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1121: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1122: upTriFactor->csrMat->values->data().get(),
1123: upTriFactor->csrMat->row_offsets->data().get(),
1124: upTriFactor->csrMat->column_indices->data().get(),
1125: upTriFactorT->csrMat->values->data().get(),
1126: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1127: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1128: CUSPARSE_ACTION_NUMERIC, indexBase,
1129: CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
1130: #else
1131: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1132: CUSPARSE_ACTION_NUMERIC, indexBase));
1133: #endif
1135: WaitForCUDA();
1136: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1138: /* Create the solve analysis information */
1139: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1140: cusparse_create_analysis_info(&upTriFactorT->solveInfo);
1141: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1142: PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1143: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1144: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1145: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1146: &upTriFactorT->solveBufferSize));
1147: cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);
1148: #endif
1150: /* perform the solve analysis */
1151: /* TODO: this repeated solve-analysis setup should be refactored into a shared helper function */
1152: PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1153: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1154: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1155: upTriFactorT->csrMat->column_indices->data().get(),
1156: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1157: upTriFactorT->solveInfo,
1158: upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1159: #else
1160: upTriFactorT->solveInfo));
1161: #endif
1163: WaitForCUDA();
1164: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1166: /* assign the pointer */
1167: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1168: return 0;
1169: }
1171: struct PetscScalarToPetscInt
1172: {
1173: __host__ __device__
1174: PetscInt operator()(PetscScalar s)
1175: {
1176: return (PetscInt)PetscRealPart(s);
1177: }
1178: };
1180: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1181: {
1182: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1183: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1184: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1185: cusparseStatus_t stat;
1186: cusparseIndexBase_t indexBase;
1188: MatSeqAIJCUSPARSECopyToGPU(A);
1189: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1191: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1193: if (A->transupdated) return 0;
1194: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1195: PetscLogGpuTimeBegin();
1196: if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1197: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1198: }
1199: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1200: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1201: cusparseCreateMatDescr(&matstructT->descr);
1202: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1203: cusparseSetMatIndexBase(matstructT->descr, indexBase);
1204: cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
1206: /* set alpha and beta */
1207: cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));
1208: cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));
1209: cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));
1210: cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
1211: cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);
1212: cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
1214: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1215: CsrMatrix *matrixT = new CsrMatrix;
1216: matstructT->mat = matrixT;
1217: matrixT->num_rows = A->cmap->n;
1218: matrixT->num_cols = A->rmap->n;
1219: matrixT->num_entries = a->nz;
1220: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1221: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1222: matrixT->values = new THRUSTARRAY(a->nz);
1224: if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1225: cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1227: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1228: #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1229: stat = cusparseCreateCsr(&matstructT->matDescr,
1230: matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1231: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1232: matrixT->values->data().get(),
1233: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1234: indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1235: #else
1236: /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1237: see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1239: I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1240: it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1241: when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1242: */
1243: if (matrixT->num_entries) {
1244: stat = cusparseCreateCsr(&matstructT->matDescr,
1245: matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1246: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1247: matrixT->values->data().get(),
1248: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
1249: indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1251: } else {
1252: matstructT->matDescr = NULL;
1253: matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1254: }
1255: #endif
1256: #endif
1257: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1258: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1260: #else
1261: CsrMatrix *temp = new CsrMatrix;
1262: CsrMatrix *tempT = new CsrMatrix;
1263: /* First convert HYB to CSR */
1264: temp->num_rows = A->rmap->n;
1265: temp->num_cols = A->cmap->n;
1266: temp->num_entries = a->nz;
1267: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1268: temp->column_indices = new THRUSTINTARRAY32(a->nz);
1269: temp->values = new THRUSTARRAY(a->nz);
1271: stat = cusparse_hyb2csr(cusparsestruct->handle,
1272: matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1273: temp->values->data().get(),
1274: temp->row_offsets->data().get(),
1275: temp->column_indices->data().get());PetscCallCUSPARSE(stat);
1277: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1278: tempT->num_rows = A->rmap->n;
1279: tempT->num_cols = A->cmap->n;
1280: tempT->num_entries = a->nz;
1281: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1282: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1283: tempT->values = new THRUSTARRAY(a->nz);
1285: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1286: temp->num_cols, temp->num_entries,
1287: temp->values->data().get(),
1288: temp->row_offsets->data().get(),
1289: temp->column_indices->data().get(),
1290: tempT->values->data().get(),
1291: tempT->column_indices->data().get(),
1292: tempT->row_offsets->data().get(),
1293: CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1295: /* Last, convert CSC to HYB */
1296: cusparseHybMat_t hybMat;
1297: cusparseCreateHybMat(&hybMat);
1298: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1299: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1300: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1301: matstructT->descr, tempT->values->data().get(),
1302: tempT->row_offsets->data().get(),
1303: tempT->column_indices->data().get(),
1304: hybMat, 0, partition);stat;
1306: /* assign the pointer */
1307: matstructT->mat = hybMat;
1308: A->transupdated = PETSC_TRUE;
1309: /* delete temporaries */
1310: if (tempT) {
1311: if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1312: if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1313: if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1314: delete (CsrMatrix*) tempT;
1315: }
1316: if (temp) {
1317: if (temp->values) delete (THRUSTARRAY*) temp->values;
1318: if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1319: if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1320: delete (CsrMatrix*) temp;
1321: }
1322: #endif
1323: }
1324: }
1325: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may already be present; update its data */
1326: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1327: CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1336: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1337: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1338: cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1339: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
1340: }
1341: if (!cusparsestruct->csr2csc_i) {
1342: THRUSTARRAY csr2csc_a(matrix->num_entries);
1343: PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1345: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1346: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1347: void *csr2cscBuffer;
1348: size_t csr2cscBufferSize;
1349: stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1350: A->cmap->n, matrix->num_entries,
1351: matrix->values->data().get(),
1352: cusparsestruct->rowoffsets_gpu->data().get(),
1353: matrix->column_indices->data().get(),
1354: matrixT->values->data().get(),
1355: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1356: CUSPARSE_ACTION_NUMERIC,indexBase,
1357: cusparsestruct->csr2cscAlg, &csr2cscBufferSize);stat;
1358: cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);
1359: #endif
1361: if (matrix->num_entries) {
1362: /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1363: mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1364: I checked every parameter and they were all fine. I have no clue why cusparse complains.
1366: Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1367: should be filled with indexBase. So I just take a shortcut here.
1368: */
1369: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1370: A->cmap->n,matrix->num_entries,
1371: csr2csc_a.data().get(),
1372: cusparsestruct->rowoffsets_gpu->data().get(),
1373: matrix->column_indices->data().get(),
1374: matrixT->values->data().get(),
1375: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1376: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1377: CUSPARSE_ACTION_NUMERIC,indexBase,
1378: cusparsestruct->csr2cscAlg, csr2cscBuffer);stat;
1379: #else
1380: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1381: CUSPARSE_ACTION_NUMERIC, indexBase);stat;
1382: #endif
1383: } else {
1384: matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1385: }
1387: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1388: PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1389: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1390: cudaFree(csr2cscBuffer);
1391: #endif
1392: }
1393: PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1394: thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1395: matrixT->values->begin()));
1396: }
1397: PetscLogGpuTimeEnd();
1398: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1399: /* the compressed row indices are not used for matTranspose */
1400: matstructT->cprowIndices = NULL;
1401: /* assign the pointer */
1402: ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1403: A->transupdated = PETSC_TRUE;
1404: return 0;
1405: }
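/* For orientation (hedged sketch): the transposed triangular factors consumed by the transpose solves
   below are built lazily on first use.  Assuming a standard cusparse LU factorization, the user-level
   path is roughly

     Mat F;
     MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_LU,&F);
     MatLUFactorSymbolic(F,A,isrow,iscol,&info);
     MatLUFactorNumeric(F,A,&info);
     MatSolveTranspose(F,b,x);   /* dispatches to MatSolveTranspose_SeqAIJCUSPARSE below */

   where isrow/iscol and info are the usual ordering index sets and MatFactorInfo. */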
1407: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1408: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1409: {
1410: PetscInt n = xx->map->n;
1411: const PetscScalar *barray;
1412: PetscScalar *xarray;
1413: thrust::device_ptr<const PetscScalar> bGPU;
1414: thrust::device_ptr<PetscScalar> xGPU;
1415: cusparseStatus_t stat;
1416: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1417: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1418: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1419: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1421: /* Analyze the matrix and create the transpose ... on the fly */
1422: if (!loTriFactorT && !upTriFactorT) {
1423: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1424: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1425: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1426: }
1428: /* Get the GPU pointers */
1429: VecCUDAGetArrayWrite(xx,&xarray);
1430: VecCUDAGetArrayRead(bb,&barray);
1431: xGPU = thrust::device_pointer_cast(xarray);
1432: bGPU = thrust::device_pointer_cast(barray);
1434: PetscLogGpuTimeBegin();
1435: /* First, reorder with the row permutation */
1436: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1437: thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1438: xGPU);
1440: /* Next, solve U */
1441: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1442: upTriFactorT->csrMat->num_rows,
1443: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1444: upTriFactorT->csrMat->num_entries,
1445: #endif
1446: &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1447: upTriFactorT->csrMat->values->data().get(),
1448: upTriFactorT->csrMat->row_offsets->data().get(),
1449: upTriFactorT->csrMat->column_indices->data().get(),
1450: upTriFactorT->solveInfo,
1451: xarray,
1452: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1453: tempGPU->data().get(),
1454: upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);stat;
1455: #else
1456: tempGPU->data().get());stat;
1457: #endif
1459: /* Then, solve L */
1460: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1461: loTriFactorT->csrMat->num_rows,
1462: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1463: loTriFactorT->csrMat->num_entries,
1464: #endif
1465: &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1466: loTriFactorT->csrMat->values->data().get(),
1467: loTriFactorT->csrMat->row_offsets->data().get(),
1468: loTriFactorT->csrMat->column_indices->data().get(),
1469: loTriFactorT->solveInfo,
1470: tempGPU->data().get(),
1471: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1472: xarray,
1473: loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);stat;
1474: #else
1475: xarray);stat;
1476: #endif
1478: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1479: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1480: thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1481: tempGPU->begin());
1483: /* Copy the temporary to the full solution. */
1484: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1486: /* restore */
1487: VecCUDARestoreArrayRead(bb,&barray);
1488: VecCUDARestoreArrayWrite(xx,&xarray);
1489: PetscLogGpuTimeEnd();
1490: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1491: return 0;
1492: }
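/* Note: relative to MatSolve_SeqAIJCUSPARSE below, the factor order is reversed here because
   (LU)^T = U^T L^T, so the transposed upper factor is solved first and the transposed lower factor
   second, with the row/column permutations applied around the pair of solves. */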
1494: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1495: {
1496: const PetscScalar *barray;
1497: PetscScalar *xarray;
1498: cusparseStatus_t stat;
1499: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1500: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1501: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1502: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1504: /* Analyze the matrix and create the transpose ... on the fly */
1505: if (!loTriFactorT && !upTriFactorT) {
1506: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1507: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1508: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1509: }
1511: /* Get the GPU pointers */
1512: VecCUDAGetArrayWrite(xx,&xarray);
1513: VecCUDAGetArrayRead(bb,&barray);
1515: PetscLogGpuTimeBegin();
1516: /* First, solve U */
1517: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1518: upTriFactorT->csrMat->num_rows,
1519: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1520: upTriFactorT->csrMat->num_entries,
1521: #endif
1522: &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1523: upTriFactorT->csrMat->values->data().get(),
1524: upTriFactorT->csrMat->row_offsets->data().get(),
1525: upTriFactorT->csrMat->column_indices->data().get(),
1526: upTriFactorT->solveInfo,
1527: barray,
1528: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1529: tempGPU->data().get(),
1530: upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);stat;
1531: #else
1532: tempGPU->data().get());stat;
1533: #endif
1535: /* Then, solve L */
1536: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1537: loTriFactorT->csrMat->num_rows,
1538: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1539: loTriFactorT->csrMat->num_entries,
1540: #endif
1541: &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1542: loTriFactorT->csrMat->values->data().get(),
1543: loTriFactorT->csrMat->row_offsets->data().get(),
1544: loTriFactorT->csrMat->column_indices->data().get(),
1545: loTriFactorT->solveInfo,
1546: tempGPU->data().get(),
1547: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1548: xarray,
1549: loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);stat;
1550: #else
1551: xarray);stat;
1552: #endif
1554: /* restore */
1555: VecCUDARestoreArrayRead(bb,&barray);
1556: VecCUDARestoreArrayWrite(xx,&xarray);
1557: PetscLogGpuTimeEnd();
1558: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1559: return 0;
1560: }
1562: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1563: {
1564: const PetscScalar *barray;
1565: PetscScalar *xarray;
1566: thrust::device_ptr<const PetscScalar> bGPU;
1567: thrust::device_ptr<PetscScalar> xGPU;
1568: cusparseStatus_t stat;
1569: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1570: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1571: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1572: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1575: /* Get the GPU pointers */
1576: VecCUDAGetArrayWrite(xx,&xarray);
1577: VecCUDAGetArrayRead(bb,&barray);
1578: xGPU = thrust::device_pointer_cast(xarray);
1579: bGPU = thrust::device_pointer_cast(barray);
1581: PetscLogGpuTimeBegin();
1582: /* First, reorder with the row permutation */
1583: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1584: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1585: tempGPU->begin());
1587: /* Next, solve L */
1588: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1589: loTriFactor->csrMat->num_rows,
1590: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1591: loTriFactor->csrMat->num_entries,
1592: #endif
1593: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1594: loTriFactor->csrMat->values->data().get(),
1595: loTriFactor->csrMat->row_offsets->data().get(),
1596: loTriFactor->csrMat->column_indices->data().get(),
1597: loTriFactor->solveInfo,
1598: tempGPU->data().get(),
1599: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1600: xarray,
1601: loTriFactor->solvePolicy, loTriFactor->solveBuffer);stat;
1602: #else
1603: xarray);stat;
1604: #endif
1606: /* Then, solve U */
1607: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1608: upTriFactor->csrMat->num_rows,
1609: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1610: upTriFactor->csrMat->num_entries,
1611: #endif
1612: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1613: upTriFactor->csrMat->values->data().get(),
1614: upTriFactor->csrMat->row_offsets->data().get(),
1615: upTriFactor->csrMat->column_indices->data().get(),
1616: upTriFactor->solveInfo,xarray,
1617: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1618: tempGPU->data().get(),
1619: upTriFactor->solvePolicy, upTriFactor->solveBuffer);stat;
1620: #else
1621: tempGPU->data().get());stat;
1622: #endif
1624: /* Last, reorder with the column permutation */
1625: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1626: thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1627: xGPU);
1629: VecCUDARestoreArrayRead(bb,&barray);
1630: VecCUDARestoreArrayWrite(xx,&xarray);
1631: PetscLogGpuTimeEnd();
1632: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1633: return 0;
1634: }
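/* The _NaturalOrdering variants are used when the factorization was done with identity row and column
   permutations: the two thrust permutation kernels are skipped and b/x are handed directly to the
   triangular solves. */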
1636: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1637: {
1638: const PetscScalar *barray;
1639: PetscScalar *xarray;
1640: cusparseStatus_t stat;
1641: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1642: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1643: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1644: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1646: /* Get the GPU pointers */
1647: VecCUDAGetArrayWrite(xx,&xarray);
1648: VecCUDAGetArrayRead(bb,&barray);
1650: PetscLogGpuTimeBegin();
1651: /* First, solve L */
1652: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1653: loTriFactor->csrMat->num_rows,
1654: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1655: loTriFactor->csrMat->num_entries,
1656: #endif
1657: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1658: loTriFactor->csrMat->values->data().get(),
1659: loTriFactor->csrMat->row_offsets->data().get(),
1660: loTriFactor->csrMat->column_indices->data().get(),
1661: loTriFactor->solveInfo,
1662: barray,
1663: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1664: tempGPU->data().get(),
1665: loTriFactor->solvePolicy,loTriFactor->solveBuffer);stat;
1666: #else
1667: tempGPU->data().get());stat;
1668: #endif
1670: /* Next, solve U */
1671: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1672: upTriFactor->csrMat->num_rows,
1673: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1674: upTriFactor->csrMat->num_entries,
1675: #endif
1676: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1677: upTriFactor->csrMat->values->data().get(),
1678: upTriFactor->csrMat->row_offsets->data().get(),
1679: upTriFactor->csrMat->column_indices->data().get(),
1680: upTriFactor->solveInfo,
1681: tempGPU->data().get(),
1682: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1683: xarray,
1684: upTriFactor->solvePolicy, upTriFactor->solveBuffer);stat;
1685: #else
1686: xarray);stat;
1687: #endif
1689: VecCUDARestoreArrayRead(bb,&barray);
1690: VecCUDARestoreArrayWrite(xx,&xarray);
1691: PetscLogGpuTimeEnd();
1692: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1693: return 0;
1694: }
1696: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1697: {
1698: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1699: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1701: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1702: CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
1704: PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1705: cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);
1706: WaitForCUDA();
1707: PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));
1708: PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1709: A->offloadmask = PETSC_OFFLOAD_BOTH;
1710: }
1711: return 0;
1712: }
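/* Host-side accessors for the AIJ value array: the read and read-write Get variants pull the values
   back from the GPU when the device copy is ahead (via MatSeqAIJCUSPARSECopyFromGPU above), while the
   write-only Get skips the copy since the values will be overwritten; the Restore variants of the
   writable accessors mark the host copy as authoritative by setting the offload mask to
   PETSC_OFFLOAD_CPU. */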
1714: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1715: {
1716: MatSeqAIJCUSPARSECopyFromGPU(A);
1717: *array = ((Mat_SeqAIJ*)A->data)->a;
1718: return 0;
1719: }
1721: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1722: {
1723: A->offloadmask = PETSC_OFFLOAD_CPU;
1724: *array = NULL;
1725: return 0;
1726: }
1728: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
1729: {
1730: MatSeqAIJCUSPARSECopyFromGPU(A);
1731: *array = ((Mat_SeqAIJ*)A->data)->a;
1732: return 0;
1733: }
1735: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
1736: {
1737: *array = NULL;
1738: return 0;
1739: }
1741: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1742: {
1743: *array = ((Mat_SeqAIJ*)A->data)->a;
1744: return 0;
1745: }
1747: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1748: {
1749: A->offloadmask = PETSC_OFFLOAD_CPU;
1750: *array = NULL;
1751: return 0;
1752: }
1754: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
1755: {
1756: Mat_SeqAIJCUSPARSE *cusp;
1757: CsrMatrix *matrix;
1759: MatSeqAIJCUSPARSECopyToGPU(A);
1761: cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
1763: matrix = (CsrMatrix*)cusp->mat->mat;
1765: if (i) {
1766: #if !defined(PETSC_USE_64BIT_INDICES)
1767: *i = matrix->row_offsets->data().get();
1768: #else
1769: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
1770: #endif
1771: }
1772: if (j) {
1773: #if !defined(PETSC_USE_64BIT_INDICES)
1774: *j = matrix->column_indices->data().get();
1775: #else
1776: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
1777: #endif
1778: }
1779: if (a) *a = matrix->values->data().get();
1780: if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
1781: return 0;
1782: }
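/* The i/j/a pointers returned above point into GPU memory (hence mtype = PETSC_MEMTYPE_CUDA), so
   callers must dispatch on the memory type rather than dereference them on the host. */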
1784: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1785: {
1786: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1787: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1788: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1789: PetscInt m = A->rmap->n,*ii,*ridx,tmp;
1790: cusparseStatus_t stat;
1791: PetscBool both = PETSC_TRUE;
1794: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1795: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1796: CsrMatrix *matrix;
1797: matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1800: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1801: matrix->values->assign(a->a, a->a+a->nz);
1802: WaitForCUDA();
1803: PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1804: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1805: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
1806: } else {
1807: PetscInt nnz;
1808: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1809: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1810: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1811: delete cusparsestruct->workVector;
1812: delete cusparsestruct->rowoffsets_gpu;
1813: cusparsestruct->workVector = NULL;
1814: cusparsestruct->rowoffsets_gpu = NULL;
1815: try {
1816: if (a->compressedrow.use) {
1817: m = a->compressedrow.nrows;
1818: ii = a->compressedrow.i;
1819: ridx = a->compressedrow.rindex;
1820: } else {
1821: m = A->rmap->n;
1822: ii = a->i;
1823: ridx = NULL;
1824: }
1826: if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1827: else nnz = a->nz;
1830: /* create cusparse matrix */
1831: cusparsestruct->nrows = m;
1832: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1833: cusparseCreateMatDescr(&matstruct->descr);
1834: cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);
1835: cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
1837: cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));
1838: cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));
1839: cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));
1840: cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
1841: cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);
1842: cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
1843: cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);
1845: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1846: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1847: /* set the matrix */
1848: CsrMatrix *mat= new CsrMatrix;
1849: mat->num_rows = m;
1850: mat->num_cols = A->cmap->n;
1851: mat->num_entries = nnz;
1852: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1853: mat->row_offsets->assign(ii, ii + m+1);
1855: mat->column_indices = new THRUSTINTARRAY32(nnz);
1856: mat->column_indices->assign(a->j, a->j+nnz);
1858: mat->values = new THRUSTARRAY(nnz);
1859: if (a->a) mat->values->assign(a->a, a->a+nnz);
1861: /* assign the pointer */
1862: matstruct->mat = mat;
1863: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1864: if (mat->num_rows) { /* cusparse errors on empty matrices! */
1865: stat = cusparseCreateCsr(&matstruct->matDescr,
1866: mat->num_rows, mat->num_cols, mat->num_entries,
1867: mat->row_offsets->data().get(), mat->column_indices->data().get(),
1868: mat->values->data().get(),
1869: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1870: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);stat;
1871: }
1872: #endif
1873: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1874: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1875: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1876: #else
1877: CsrMatrix *mat= new CsrMatrix;
1878: mat->num_rows = m;
1879: mat->num_cols = A->cmap->n;
1880: mat->num_entries = nnz;
1881: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1882: mat->row_offsets->assign(ii, ii + m+1);
1884: mat->column_indices = new THRUSTINTARRAY32(nnz);
1885: mat->column_indices->assign(a->j, a->j+nnz);
1887: mat->values = new THRUSTARRAY(nnz);
1888: if (a->a) mat->values->assign(a->a, a->a+nnz);
1890: cusparseHybMat_t hybMat;
1891: cusparseCreateHybMat(&hybMat);
1892: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1893: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1894: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1895: matstruct->descr, mat->values->data().get(),
1896: mat->row_offsets->data().get(),
1897: mat->column_indices->data().get(),
1898: hybMat, 0, partition);stat;
1899: /* assign the pointer */
1900: matstruct->mat = hybMat;
1902: if (mat) {
1903: if (mat->values) delete (THRUSTARRAY*)mat->values;
1904: if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1905: if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1906: delete (CsrMatrix*)mat;
1907: }
1908: #endif
1909: }
1911: /* assign the compressed row indices */
1912: if (a->compressedrow.use) {
1913: cusparsestruct->workVector = new THRUSTARRAY(m);
1914: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1915: matstruct->cprowIndices->assign(ridx,ridx+m);
1916: tmp = m;
1917: } else {
1918: cusparsestruct->workVector = NULL;
1919: matstruct->cprowIndices = NULL;
1920: tmp = 0;
1921: }
1922: PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));
1924: /* assign the pointer */
1925: cusparsestruct->mat = matstruct;
1926: } catch(char *ex) {
1927: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1928: }
1929: WaitForCUDA();
1930: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1931: cusparsestruct->nonzerostate = A->nonzerostate;
1932: }
1933: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1934: }
1935: return 0;
1936: }
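/* Sketch (hedged): when the nonzero pattern is unchanged (A->nonzerostate still matches the cached
   nonzerostate) only the values are re-uploaded by the branch above, e.g.

     MatSetValues(A,1,&row,1,&col,&val,INSERT_VALUES);   /* same sparsity pattern as before */
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
     MatMult(A,x,y);   /* triggers MatSeqAIJCUSPARSECopyToGPU: values-only path */

   whereas a changed pattern rebuilds the whole Mat_SeqAIJCUSPARSEMultStruct. */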
1938: struct VecCUDAPlusEquals
1939: {
1940: template <typename Tuple>
1941: __host__ __device__
1942: void operator()(Tuple t)
1943: {
1944: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1945: }
1946: };
1948: struct VecCUDAEquals
1949: {
1950: template <typename Tuple>
1951: __host__ __device__
1952: void operator()(Tuple t)
1953: {
1954: thrust::get<1>(t) = thrust::get<0>(t);
1955: }
1956: };
1958: struct VecCUDAEqualsReverse
1959: {
1960: template <typename Tuple>
1961: __host__ __device__
1962: void operator()(Tuple t)
1963: {
1964: thrust::get<0>(t) = thrust::get<1>(t);
1965: }
1966: };
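/* These functors operate on thrust zip-iterator tuples.  For illustration (hedged), VecCUDAPlusEquals
   implements y[i] += x[i] over a zipped range:

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(x.begin(),y.begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(x.end(),y.end())),
                      VecCUDAPlusEquals());
*/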
1968: struct MatMatCusparse {
1969: PetscBool cisdense;
1970: PetscScalar *Bt;
1971: Mat X;
1972: PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1973: PetscLogDouble flops;
1974: CsrMatrix *Bcsr;
1976: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1977: cusparseSpMatDescr_t matSpBDescr;
1978: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
1979: cusparseDnMatDescr_t matBDescr;
1980: cusparseDnMatDescr_t matCDescr;
1981: PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1982: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
1983: void *dBuffer4;
1984: void *dBuffer5;
1985: #endif
1986: size_t mmBufferSize;
1987: void *mmBuffer;
1988: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1989: cusparseSpGEMMDescr_t spgemmDesc;
1990: #endif
1991: };
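/* MatMatCusparse caches the cusparse descriptors and work buffers across repeated numeric phases, so
   that MatProductNumeric calls on the same operands do not redo the SpMM/SpGEMM setup. */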
1993: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1994: {
1995: MatMatCusparse *mmdata = (MatMatCusparse *)data;
1997: cudaFree(mmdata->Bt);
1998: delete mmdata->Bcsr;
1999: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2000: if (mmdata->matSpBDescr) cusparseDestroySpMat(mmdata->matSpBDescr);
2001: if (mmdata->matBDescr) cusparseDestroyDnMat(mmdata->matBDescr);
2002: if (mmdata->matCDescr) cusparseDestroyDnMat(mmdata->matCDescr);
2003: if (mmdata->spgemmDesc) cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);
2004: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2005: if (mmdata->dBuffer4) cudaFree(mmdata->dBuffer4);
2006: if (mmdata->dBuffer5) cudaFree(mmdata->dBuffer5);
2007: #endif
2008: if (mmdata->mmBuffer) cudaFree(mmdata->mmBuffer);
2009: if (mmdata->mmBuffer2) cudaFree(mmdata->mmBuffer2);
2010: #endif
2011: MatDestroy(&mmdata->X);
2012: PetscFree(data);
2013: return 0;
2014: }
2016: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2018: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2019: {
2020: Mat_Product *product = C->product;
2021: Mat A,B;
2022: PetscInt m,n,blda,clda;
2023: PetscBool flg,biscuda;
2024: Mat_SeqAIJCUSPARSE *cusp;
2025: cusparseStatus_t stat;
2026: cusparseOperation_t opA;
2027: const PetscScalar *barray;
2028: PetscScalar *carray;
2029: MatMatCusparse *mmdata;
2030: Mat_SeqAIJCUSPARSEMultStruct *mat;
2031: CsrMatrix *csrmat;
2033: MatCheckProduct(C,1);
2035: mmdata = (MatMatCusparse*)product->data;
2036: A = product->A;
2037: B = product->B;
2038: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2040: /* currently CopyToGpu does not copy if the matrix is bound to the CPU.
2041: Instead of silently accepting the wrong answer, I prefer to raise an error */
2043: MatSeqAIJCUSPARSECopyToGPU(A);
2044: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2045: switch (product->type) {
2046: case MATPRODUCT_AB:
2047: case MATPRODUCT_PtAP:
2048: mat = cusp->mat;
2049: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2050: m = A->rmap->n;
2051: n = B->cmap->n;
2052: break;
2053: case MATPRODUCT_AtB:
2054: if (!A->form_explicit_transpose) {
2055: mat = cusp->mat;
2056: opA = CUSPARSE_OPERATION_TRANSPOSE;
2057: } else {
2058: MatSeqAIJCUSPARSEFormExplicitTranspose(A);
2059: mat = cusp->matTranspose;
2060: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2061: }
2062: m = A->cmap->n;
2063: n = B->cmap->n;
2064: break;
2065: case MATPRODUCT_ABt:
2066: case MATPRODUCT_RARt:
2067: mat = cusp->mat;
2068: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2069: m = A->rmap->n;
2070: n = B->rmap->n;
2071: break;
2072: default:
2073: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2074: }
2076: csrmat = (CsrMatrix*)mat->mat;
2077: /* if the user passed a CPU matrix, copy the data to the GPU */
2078: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
2079: if (!biscuda) MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
2080: MatDenseCUDAGetArrayRead(B,&barray);
2082: MatDenseGetLDA(B,&blda);
2083: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2084: MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
2085: MatDenseGetLDA(mmdata->X,&clda);
2086: } else {
2087: MatDenseCUDAGetArrayWrite(C,&carray);
2088: MatDenseGetLDA(C,&clda);
2089: }
2091: PetscLogGpuTimeBegin();
2092: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2093: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2094: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2095: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2096: size_t mmBufferSize;
2097: if (mmdata->initialized && mmdata->Blda != blda) {cusparseDestroyDnMat(mmdata->matBDescr); mmdata->matBDescr = NULL;}
2098: if (!mmdata->matBDescr) {
2099: cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);
2100: mmdata->Blda = blda;
2101: }
2103: if (mmdata->initialized && mmdata->Clda != clda) {cusparseDestroyDnMat(mmdata->matCDescr); mmdata->matCDescr = NULL;}
2104: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2105: cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);
2106: mmdata->Clda = clda;
2107: }
2109: if (!mat->matDescr) {
2110: stat = cusparseCreateCsr(&mat->matDescr,
2111: csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2112: csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2113: csrmat->values->data().get(),
2114: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2115: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);stat;
2116: }
2117: stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2118: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2119: mmdata->matCDescr,cusparse_scalartype,
2120: cusp->spmmAlg,&mmBufferSize);stat;
2121: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2122: cudaFree(mmdata->mmBuffer);
2123: cudaMalloc(&mmdata->mmBuffer,mmBufferSize);
2124: mmdata->mmBufferSize = mmBufferSize;
2125: }
2126: mmdata->initialized = PETSC_TRUE;
2127: } else {
2128: /* to be safe, always update pointers of the mats */
2129: cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());
2130: cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);
2131: cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);
2132: }
2134: /* do cusparseSpMM, which supports transpose on B */
2135: stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2136: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2137: mmdata->matCDescr,cusparse_scalartype,
2138: cusp->spmmAlg,mmdata->mmBuffer);stat;
2139: #else
2140: PetscInt k;
2141: /* cusparseXcsrmm does not support transpose on B */
2142: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2143: cublasHandle_t cublasv2handle;
2144: cublasStatus_t cerr;
2146: PetscCUBLASGetHandle(&cublasv2handle);
2147: cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2148: B->cmap->n,B->rmap->n,
2149: &PETSC_CUSPARSE_ONE ,barray,blda,
2150: &PETSC_CUSPARSE_ZERO,barray,blda,
2151: mmdata->Bt,B->cmap->n);cerr;
2152: blda = B->cmap->n;
2153: k = B->cmap->n;
2154: } else {
2155: k = B->rmap->n;
2156: }
2158: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2159: stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2160: csrmat->num_entries,mat->alpha_one,mat->descr,
2161: csrmat->values->data().get(),
2162: csrmat->row_offsets->data().get(),
2163: csrmat->column_indices->data().get(),
2164: mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2165: carray,clda);stat;
2166: #endif
2167: PetscLogGpuTimeEnd();
2168: PetscLogGpuFlops(n*2.0*csrmat->num_entries);
2169: MatDenseCUDARestoreArrayRead(B,&barray);
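/* For MATPRODUCT_RARt and MATPRODUCT_PtAP the SpMM above only produced the intermediate
   X = A*B^T (resp. X = A*B); the remaining dense-dense product with B (resp. B^T) is performed next. */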
2170: if (product->type == MATPRODUCT_RARt) {
2171: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2172: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
2173: } else if (product->type == MATPRODUCT_PtAP) {
2174: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2175: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
2176: } else {
2177: MatDenseCUDARestoreArrayWrite(C,&carray);
2178: }
2179: if (mmdata->cisdense) {
2180: MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
2181: }
2182: if (!biscuda) {
2183: MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
2184: }
2185: return 0;
2186: }
2188: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2189: {
2190: Mat_Product *product = C->product;
2191: Mat A,B;
2192: PetscInt m,n;
2193: PetscBool cisdense,flg;
2194: MatMatCusparse *mmdata;
2195: Mat_SeqAIJCUSPARSE *cusp;
2197: MatCheckProduct(C,1);
2199: A = product->A;
2200: B = product->B;
2201: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2203: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2205: switch (product->type) {
2206: case MATPRODUCT_AB:
2207: m = A->rmap->n;
2208: n = B->cmap->n;
2209: break;
2210: case MATPRODUCT_AtB:
2211: m = A->cmap->n;
2212: n = B->cmap->n;
2213: break;
2214: case MATPRODUCT_ABt:
2215: m = A->rmap->n;
2216: n = B->rmap->n;
2217: break;
2218: case MATPRODUCT_PtAP:
2219: m = B->cmap->n;
2220: n = B->cmap->n;
2221: break;
2222: case MATPRODUCT_RARt:
2223: m = B->rmap->n;
2224: n = B->rmap->n;
2225: break;
2226: default:
2227: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2228: }
2229: MatSetSizes(C,m,n,m,n);
2230: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2231: PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
2232: MatSetType(C,MATSEQDENSECUDA);
2234: /* product data */
2235: PetscNew(&mmdata);
2236: mmdata->cisdense = cisdense;
2237: #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2238: /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2239: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2240: cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));
2241: }
2242: #endif
2243: /* for these products we need intermediate storage */
2244: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2245: MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
2246: MatSetType(mmdata->X,MATSEQDENSECUDA);
2247: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2248: MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
2249: } else {
2250: MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
2251: }
2252: }
2253: C->product->data = mmdata;
2254: C->product->destroy = MatDestroy_MatMatCusparse;
2256: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2257: return 0;
2258: }
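/* Sketch (hedged) of the user-level path reaching the two routines above for a sparse-dense product:

     MatProductCreate(A,B,NULL,&C);   /* A is MATSEQAIJCUSPARSE; a plain MATSEQDENSE B is converted on the fly */
     MatProductSetType(C,MATPRODUCT_AB);
     MatProductSetFromOptions(C);
     MatProductSymbolic(C);           /* MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA */
     MatProductNumeric(C);            /* MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA */
*/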
2260: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2261: {
2262: Mat_Product *product = C->product;
2263: Mat A,B;
2264: Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp;
2265: Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
2266: Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2267: CsrMatrix *Acsr,*Bcsr,*Ccsr;
2268: PetscBool flg;
2269: cusparseStatus_t stat;
2270: MatProductType ptype;
2271: MatMatCusparse *mmdata;
2272: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2273: cusparseSpMatDescr_t BmatSpDescr;
2274: #endif
2275: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2277: MatCheckProduct(C,1);
2279: PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);
2281: mmdata = (MatMatCusparse*)C->product->data;
2282: A = product->A;
2283: B = product->B;
2284: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2285: mmdata->reusesym = PETSC_FALSE;
2286: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2288: Cmat = Ccusp->mat;
2290: Ccsr = (CsrMatrix*)Cmat->mat;
2292: goto finalize;
2293: }
2294: if (!c->nz) goto finalize;
2295: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2297: PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2301: Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2302: Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2303: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2307: MatSeqAIJCUSPARSECopyToGPU(A);
2308: MatSeqAIJCUSPARSECopyToGPU(B);
2310: ptype = product->type;
2311: if (A->symmetric && ptype == MATPRODUCT_AtB) {
2312: ptype = MATPRODUCT_AB;
2314: }
2315: if (B->symmetric && ptype == MATPRODUCT_ABt) {
2316: ptype = MATPRODUCT_AB;
2318: }
2319: switch (ptype) {
2320: case MATPRODUCT_AB:
2321: Amat = Acusp->mat;
2322: Bmat = Bcusp->mat;
2323: break;
2324: case MATPRODUCT_AtB:
2325: Amat = Acusp->matTranspose;
2326: Bmat = Bcusp->mat;
2327: break;
2328: case MATPRODUCT_ABt:
2329: Amat = Acusp->mat;
2330: Bmat = Bcusp->matTranspose;
2331: break;
2332: default:
2333: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2334: }
2335: Cmat = Ccusp->mat;
2339: Acsr = (CsrMatrix*)Amat->mat;
2340: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2341: Ccsr = (CsrMatrix*)Cmat->mat;
2345: PetscLogGpuTimeBegin();
2346: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2347: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2348: cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);
2349: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2350: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2351: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2352: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2353: mmdata->spgemmDesc);stat;
2354: #else
2355: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2356: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2357: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2358: mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);stat;
2359: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2360: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2361: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);stat;
2362: #endif
2363: #else
2364: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2365: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2366: Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2367: Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2368: Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());stat;
2369: #endif
2370: PetscLogGpuFlops(mmdata->flops);
2371: WaitForCUDA();
2372: PetscLogGpuTimeEnd();
2373: C->offloadmask = PETSC_OFFLOAD_GPU;
2374: finalize:
2375: /* shorter version of MatAssemblyEnd_SeqAIJ */
2376: PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);
2377: PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");
2378: PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);
2379: c->reallocs = 0;
2380: C->info.mallocs += 0;
2381: C->info.nz_unneeded = 0;
2382: C->assembled = C->was_assembled = PETSC_TRUE;
2383: C->num_ass++;
2384: return 0;
2385: }
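/* Sketch (hedged): for two MATSEQAIJCUSPARSE operands the usual entry point is

     MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);

   which goes through the symbolic routine below and the numeric routine above; with MAT_REUSE_MATRIX
   only the numeric phase is repeated. */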
2387: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2388: {
2389: Mat_Product *product = C->product;
2390: Mat A,B;
2391: Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp;
2392: Mat_SeqAIJ *a,*b,*c;
2393: Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2394: CsrMatrix *Acsr,*Bcsr,*Ccsr;
2395: PetscInt i,j,m,n,k;
2396: PetscBool flg;
2397: cusparseStatus_t stat;
2398: MatProductType ptype;
2399: MatMatCusparse *mmdata;
2400: PetscLogDouble flops;
2401: PetscBool biscompressed,ciscompressed;
2402: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2403: int64_t C_num_rows1, C_num_cols1, C_nnz1;
2404: cusparseSpMatDescr_t BmatSpDescr;
2405: #else
2406: int cnz;
2407: #endif
2408: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2410: MatCheckProduct(C,1);
2412: A = product->A;
2413: B = product->B;
2414: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2416: PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2418: a = (Mat_SeqAIJ*)A->data;
2419: b = (Mat_SeqAIJ*)B->data;
2420: /* product data */
2421: PetscNew(&mmdata);
2422: C->product->data = mmdata;
2423: C->product->destroy = MatDestroy_MatMatCusparse;
2425: MatSeqAIJCUSPARSECopyToGPU(A);
2426: MatSeqAIJCUSPARSECopyToGPU(B);
2427: Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2428: Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2432: ptype = product->type;
2433: if (A->symmetric && ptype == MATPRODUCT_AtB) {
2434: ptype = MATPRODUCT_AB;
2435: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2436: }
2437: if (B->symmetric && ptype == MATPRODUCT_ABt) {
2438: ptype = MATPRODUCT_AB;
2439: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2440: }
2441: biscompressed = PETSC_FALSE;
2442: ciscompressed = PETSC_FALSE;
2443: switch (ptype) {
2444: case MATPRODUCT_AB:
2445: m = A->rmap->n;
2446: n = B->cmap->n;
2447: k = A->cmap->n;
2448: Amat = Acusp->mat;
2449: Bmat = Bcusp->mat;
2450: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2451: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2452: break;
2453: case MATPRODUCT_AtB:
2454: m = A->cmap->n;
2455: n = B->cmap->n;
2456: k = A->rmap->n;
2457: MatSeqAIJCUSPARSEFormExplicitTranspose(A);
2458: Amat = Acusp->matTranspose;
2459: Bmat = Bcusp->mat;
2460: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2461: break;
2462: case MATPRODUCT_ABt:
2463: m = A->rmap->n;
2464: n = B->rmap->n;
2465: k = A->cmap->n;
2466: MatSeqAIJCUSPARSEFormExplicitTranspose(B);
2467: Amat = Acusp->mat;
2468: Bmat = Bcusp->matTranspose;
2469: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2470: break;
2471: default:
2472: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2473: }
2475: /* create cusparse matrix */
2476: MatSetSizes(C,m,n,m,n);
2477: MatSetType(C,MATSEQAIJCUSPARSE);
2478: c = (Mat_SeqAIJ*)C->data;
2479: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2480: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
2481: Ccsr = new CsrMatrix;
2483: c->compressedrow.use = ciscompressed;
2484: if (c->compressedrow.use) { /* if a is in compressed row format, then c will be too */
2485: c->compressedrow.nrows = a->compressedrow.nrows;
2486: PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);
2487: PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);
2488: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2489: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2490: Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2491: } else {
2492: c->compressedrow.nrows = 0;
2493: c->compressedrow.i = NULL;
2494: c->compressedrow.rindex = NULL;
2495: Ccusp->workVector = NULL;
2496: Cmat->cprowIndices = NULL;
2497: }
2498: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2499: Ccusp->mat = Cmat;
2500: Ccusp->mat->mat = Ccsr;
2501: Ccsr->num_rows = Ccusp->nrows;
2502: Ccsr->num_cols = n;
2503: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2504: cusparseCreateMatDescr(&Cmat->descr);
2505: cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);
2506: cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
2507: cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));
2508: cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));
2509: cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));
2510: cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
2511: cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);
2512: cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
2513: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
2514: thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2515: c->nz = 0;
2516: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2517: Ccsr->values = new THRUSTARRAY(c->nz);
2518: goto finalizesym;
2519: }
2523: Acsr = (CsrMatrix*)Amat->mat;
2524: if (!biscompressed) {
2525: Bcsr = (CsrMatrix*)Bmat->mat;
2526: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2527: BmatSpDescr = Bmat->matDescr;
2528: #endif
2529: } else { /* we need to use row offsets for the full matrix */
2530: CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2531: Bcsr = new CsrMatrix;
2532: Bcsr->num_rows = B->rmap->n;
2533: Bcsr->num_cols = cBcsr->num_cols;
2534: Bcsr->num_entries = cBcsr->num_entries;
2535: Bcsr->column_indices = cBcsr->column_indices;
2536: Bcsr->values = cBcsr->values;
2537: if (!Bcusp->rowoffsets_gpu) {
2538: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2539: Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2540: PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
2541: }
2542: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2543: mmdata->Bcsr = Bcsr;
2544: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2545: if (Bcsr->num_rows && Bcsr->num_cols) {
2546: stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2547: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2548: Bcsr->values->data().get(),
2549: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2550: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);stat;
2551: }
2552: BmatSpDescr = mmdata->matSpBDescr;
2553: #endif
2554: }
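/* When B is stored in compressed row format, Bcsr aliases B's column indices and values but swaps in
   the full (uncompressed) row offsets, so that cusparse sees a CSR operand with all B->rmap->n rows. */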
2557: /* precompute flops count */
2558: if (ptype == MATPRODUCT_AB) {
2559: for (i=0, flops = 0; i<A->rmap->n; i++) {
2560: const PetscInt st = a->i[i];
2561: const PetscInt en = a->i[i+1];
2562: for (j=st; j<en; j++) {
2563: const PetscInt brow = a->j[j];
2564: flops += 2.*(b->i[brow+1] - b->i[brow]);
2565: }
2566: }
2567: } else if (ptype == MATPRODUCT_AtB) {
2568: for (i=0, flops = 0; i<A->rmap->n; i++) {
2569: const PetscInt anzi = a->i[i+1] - a->i[i];
2570: const PetscInt bnzi = b->i[i+1] - b->i[i];
2571: flops += (2.*anzi)*bnzi;
2572: }
2573: } else { /* TODO */
2574: flops = 0.;
2575: }
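/* Flop estimates: for MATPRODUCT_AB each stored entry a(i,brow) contributes 2*nnz(B(brow,:)) flops;
   for MATPRODUCT_AtB row i of A is paired with row i of B, contributing 2*nnz(A(i,:))*nnz(B(i,:)). */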
2577: mmdata->flops = flops;
2578: PetscLogGpuTimeBegin();
2580: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2581: cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);
2582: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2583: NULL, NULL, NULL,
2584: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2585: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);stat;
2586: cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);
2587: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2588: {
2589: /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2590: We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2591: */
2592: void* dBuffer1 = NULL;
2593: void* dBuffer2 = NULL;
2594: void* dBuffer3 = NULL;
2595: /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2596: size_t bufferSize1 = 0;
2597: size_t bufferSize2 = 0;
2598: size_t bufferSize3 = 0;
2599: size_t bufferSize4 = 0;
2600: size_t bufferSize5 = 0;
2602: /*----------------------------------------------------------------------*/
2603: /* ask bufferSize1 bytes for external memory */
2604: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2605: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2606: &bufferSize1, NULL);stat;
2607: cudaMalloc((void**) &dBuffer1, bufferSize1);
2608: /* inspect the matrices A and B to understand the memory requirement for the next step */
2609: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2610: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2611: &bufferSize1, dBuffer1);stat;
2613: /*----------------------------------------------------------------------*/
2614: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2615: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2616: &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);stat;
2617: cudaMalloc((void**) &dBuffer2, bufferSize2);
2618: cudaMalloc((void**) &dBuffer3, bufferSize3);
2619: cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);
2620: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2621: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2622: &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);stat;
2623: cudaFree(dBuffer1);
2624: cudaFree(dBuffer2);
2626: /*----------------------------------------------------------------------*/
2627: /* get matrix C non-zero entries C_nnz1 */
2628: cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);
2629: c->nz = (PetscInt) C_nnz1;
2630: /* allocate matrix C */
2631: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);cudaPeekAtLastError(); /* catch out of memory errors */
2632: Ccsr->values = new THRUSTARRAY(c->nz);cudaPeekAtLastError(); /* catch out of memory errors */
2633: /* update matC with the new pointers */
2634: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2635: Ccsr->values->data().get());stat;
2637: /*----------------------------------------------------------------------*/
2638: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2639: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2640: &bufferSize5, NULL);stat;
2641: cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);
2642: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2643: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2644: &bufferSize5, mmdata->dBuffer5);stat;
2645: cudaFree(dBuffer3);
2646: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2647: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2648: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2649: mmdata->spgemmDesc);stat;
2650: PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);
2651: }
2652: #else
2653: size_t bufSize2;
2654: /* ask bufferSize bytes for external memory */
2655: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2656: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2657: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2658: mmdata->spgemmDesc, &bufSize2, NULL);stat;
2659: cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);
2660: /* inspect the matrices A and B to understand the memory requirement for the next step */
2661: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2662: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2663: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2664: mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);stat;
2665: /* ask again for the number of bytes of external memory needed */
2666: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2667: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2668: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2669: mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);stat;
2670: /* Neither the CUSPARSE documentation nor the API is clear here:
2671: we need both buffers to perform the operations properly!
2672: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2673: it only appears in the workEstimation call, yet it seems to be needed in compute, so presumably its
2674: address is stored in the descriptor. What a messy API... */
2675: cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);
2676: /* compute the intermediate product of A * B */
2677: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2678: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2679: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2680: mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);stat;
2681: /* get matrix C non-zero entries C_nnz1 */
2682: cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);
2683: c->nz = (PetscInt) C_nnz1;
2684: PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);
2685: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2686: cudaPeekAtLastError(); /* catch out of memory errors */
2687: Ccsr->values = new THRUSTARRAY(c->nz);
2688: cudaPeekAtLastError(); /* catch out of memory errors */
2689: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2690: Ccsr->values->data().get());stat;
2691: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2692: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2693: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);stat;
2694: #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2695: #else
2696: cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);
2697: stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2698: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2699: Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2700: Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2701: Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
2702: c->nz = cnz;
2703: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2704: cudaPeekAtLastError(); /* catch out of memory errors */
2705: Ccsr->values = new THRUSTARRAY(c->nz);
2706: cudaPeekAtLastError(); /* catch out of memory errors */
2708: cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);
2709: /* with the old gemm interface (removed from 11.0 on) we cannot compute only the symbolic factorization.
2710: I have tried the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when
2711: D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2712: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2713: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2714: Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2715: Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2716: Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
2717: #endif
2718: PetscLogGpuFlops(mmdata->flops);
2719: PetscLogGpuTimeEnd();
2720: finalizesym:
2721: c->singlemalloc = PETSC_FALSE;
2722: c->free_a = PETSC_TRUE;
2723: c->free_ij = PETSC_TRUE;
2724: PetscMalloc1(m+1,&c->i);
2725: PetscMalloc1(c->nz,&c->j);
2726: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2727: PetscInt *d_i = c->i;
2728: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2729: THRUSTINTARRAY jj(Ccsr->column_indices->size());
2730: ii = *Ccsr->row_offsets;
2731: jj = *Ccsr->column_indices;
2732: if (ciscompressed) d_i = c->compressedrow.i;
2733: cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
2734: cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
2735: } else {
2736: PetscInt *d_i = c->i;
2737: if (ciscompressed) d_i = c->compressedrow.i;
2738: cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
2739: cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
2740: }
2741: if (ciscompressed) { /* need to expand host row offsets */
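/* Worked example (illustrative numbers, not from the source): with m = 5, compressedrow.nrows = 2,
   rindex = [1,3] and compressed offsets compressedrow.i = [0,2,5], the loop below expands to the full
   c->i = [0,0,2,2,5,5]: rows 0, 2 and 4 are empty, while rows 1 and 3 keep their 2 and 3 nonzeros. */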
2742: PetscInt r = 0;
2743: c->i[0] = 0;
2744: for (k = 0; k < c->compressedrow.nrows; k++) {
2745: const PetscInt next = c->compressedrow.rindex[k];
2746: const PetscInt old = c->compressedrow.i[k];
2747: for (; r < next; r++) c->i[r+1] = old;
2748: }
2749: for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2750: }
2751: PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
2752: PetscMalloc1(m,&c->ilen);
2753: PetscMalloc1(m,&c->imax);
2754: c->maxnz = c->nz;
2755: c->nonzerorowcnt = 0;
2756: c->rmax = 0;
2757: for (k = 0; k < m; k++) {
2758: const PetscInt nn = c->i[k+1] - c->i[k];
2759: c->ilen[k] = c->imax[k] = nn;
2760: c->nonzerorowcnt += (PetscInt)!!nn;
2761: c->rmax = PetscMax(c->rmax,nn);
2762: }
2763: MatMarkDiagonal_SeqAIJ(C);
2764: PetscMalloc1(c->nz,&c->a);
2765: Ccsr->num_entries = c->nz;
2767: C->nonzerostate++;
2768: PetscLayoutSetUp(C->rmap);
2769: PetscLayoutSetUp(C->cmap);
2770: Ccusp->nonzerostate = C->nonzerostate;
2771: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2772: C->preallocated = PETSC_TRUE;
2773: C->assembled = PETSC_FALSE;
2774: C->was_assembled = PETSC_FALSE;
2775: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2776: mmdata->reusesym = PETSC_TRUE;
2777: C->offloadmask = PETSC_OFFLOAD_GPU;
2778: }
2779: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2780: return 0;
2781: }
2783: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2785: /* handles sparse or dense B */
2786: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2787: {
2788: Mat_Product *product = mat->product;
2790: PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2792: MatCheckProduct(mat,1);
2793: PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);
2794: if (!product->A->boundtocpu && !product->B->boundtocpu) {
2795: PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);
2796: }
2797: if (product->type == MATPRODUCT_ABC) {
2798: Ciscusp = PETSC_FALSE;
2799: if (!product->C->boundtocpu) {
2800: PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);
2801: }
2802: }
2803: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2804: PetscBool usecpu = PETSC_FALSE;
2805: switch (product->type) {
2806: case MATPRODUCT_AB:
2807: if (product->api_user) {
2808: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
2809: PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2810: PetscOptionsEnd();
2811: } else {
2812: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
2813: PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2814: PetscOptionsEnd();
2815: }
2816: break;
2817: case MATPRODUCT_AtB:
2818: if (product->api_user) {
2819: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
2820: PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2821: PetscOptionsEnd();
2822: } else {
2823: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
2824: PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2825: PetscOptionsEnd();
2826: }
2827: break;
2828: case MATPRODUCT_PtAP:
2829: if (product->api_user) {
2830: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
2831: PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2832: PetscOptionsEnd();
2833: } else {
2834: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
2835: PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2836: PetscOptionsEnd();
2837: }
2838: break;
2839: case MATPRODUCT_RARt:
2840: if (product->api_user) {
2841: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
2842: PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2843: PetscOptionsEnd();
2844: } else {
2845: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
2846: PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2847: PetscOptionsEnd();
2848: }
2849: break;
2850: case MATPRODUCT_ABC:
2851: if (product->api_user) {
2852: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
2853: PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2854: PetscOptionsEnd();
2855: } else {
2856: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
2857: PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2858: PetscOptionsEnd();
2859: }
2860: break;
2861: default:
2862: break;
2863: }
2864: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2865: }
2866: /* dispatch */
2867: if (isdense) {
2868: switch (product->type) {
2869: case MATPRODUCT_AB:
2870: case MATPRODUCT_AtB:
2871: case MATPRODUCT_ABt:
2872: case MATPRODUCT_PtAP:
2873: case MATPRODUCT_RARt:
2874: if (product->A->boundtocpu) {
2875: MatProductSetFromOptions_SeqAIJ_SeqDense(mat);
2876: } else {
2877: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2878: }
2879: break;
2880: case MATPRODUCT_ABC:
2881: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2882: break;
2883: default:
2884: break;
2885: }
2886: } else if (Biscusp && Ciscusp) {
2887: switch (product->type) {
2888: case MATPRODUCT_AB:
2889: case MATPRODUCT_AtB:
2890: case MATPRODUCT_ABt:
2891: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2892: break;
2893: case MATPRODUCT_PtAP:
2894: case MATPRODUCT_RARt:
2895: case MATPRODUCT_ABC:
2896: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2897: break;
2898: default:
2899: break;
2900: }
2901: } else { /* fallback for AIJ */
2902: MatProductSetFromOptions_SeqAIJ(mat);
2903: }
2904: return 0;
2905: }
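/* Illustrative note (not part of this file): the CPU-backend options handled above are plain
   command-line switches, e.g.
     ./app -matmatmult_backend_cpu              (product created through the MatMatMult() API)
     ./app -mat_product_algorithm_backend_cpu   (product created through the MatProduct API)
   which make C = A*B fall back to the MATSEQAIJ host implementation. The program name ./app is a placeholder. */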
2907: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2908: {
2909: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2910: return 0;
2911: }
2913: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2914: {
2915: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2916: return 0;
2917: }
2919: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2920: {
2921: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2922: return 0;
2923: }
2925: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2926: {
2927: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2928: return 0;
2929: }
2931: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2932: {
2933: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2934: return 0;
2935: }
2937: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2938: {
2939: int i = blockIdx.x*blockDim.x + threadIdx.x;
2940: if (i < n) y[idx[i]] += x[i];
2941: }
2943: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2944: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2945: {
2946: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2947: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2948: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2949: PetscScalar *xarray,*zarray,*dptr,*beta,*xptr;
2950: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2951: PetscBool compressed;
2952: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2953: PetscInt nx,ny;
2954: #endif
2957: if (!a->nz) {
2958: if (!yy) VecSet_SeqCUDA(zz,0);
2959: else VecCopy_SeqCUDA(yy,zz);
2960: return 0;
2961: }
2962: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2963: MatSeqAIJCUSPARSECopyToGPU(A);
2964: if (!trans) {
2965: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2967: } else {
2968: if (herm || !A->form_explicit_transpose) {
2969: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2970: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2971: } else {
2972: if (!cusparsestruct->matTranspose) MatSeqAIJCUSPARSEFormExplicitTranspose(A);
2973: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2974: }
2975: }
2976: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2977: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2979: try {
2980: VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
2981: if (yy == zz) VecCUDAGetArray(zz,&zarray); /* read & write zz, so need to get uptodate zarray on GPU */
2982: else VecCUDAGetArrayWrite(zz,&zarray); /* write zz, so no need to init zarray on GPU */
2984: PetscLogGpuTimeBegin();
2985: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2986: /* z = A x + beta y.
2987: If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2988: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2989: */
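/* Example (illustrative): if A has 10 rows of which only 4 are nonzero, the compressed product A*x
   has length 4 and lands in workVector; it is scattered into the full-length zz further below by the
   ScatterAdd kernel. */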
2990: xptr = xarray;
2991: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2992: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2993: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2994: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2995: allocated to accommodate different uses. So we get the length info directly from mat.
2996: */
2997: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2998: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2999: nx = mat->num_cols;
3000: ny = mat->num_rows;
3001: }
3002: #endif
3003: } else {
3004: /* z = A^T x + beta y
3005: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3006: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3007: */
3008: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3009: dptr = zarray;
3010: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3011: if (compressed) { /* Scatter x to work vector */
3012: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3013: thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3014: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3015: VecCUDAEqualsReverse());
3016: }
3017: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3018: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3019: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3020: nx = mat->num_rows;
3021: ny = mat->num_cols;
3022: }
3023: #endif
3024: }
3026: /* csr_spmv does y = alpha op(A) x + beta y */
3027: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3028: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3030: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3031: cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);
3032: cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);
3033: PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
3034: matstruct->matDescr,
3035: matstruct->cuSpMV[opA].vecXDescr, beta,
3036: matstruct->cuSpMV[opA].vecYDescr,
3037: cusparse_scalartype,
3038: cusparsestruct->spmvAlg,
3039: &matstruct->cuSpMV[opA].spmvBufferSize));
3040: cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);
3042: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3043: } else {
3044: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3045: cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);
3046: cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);
3047: }
3049: PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
3050: matstruct->alpha_one,
3051: matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3052: matstruct->cuSpMV[opA].vecXDescr,
3053: beta,
3054: matstruct->cuSpMV[opA].vecYDescr,
3055: cusparse_scalartype,
3056: cusparsestruct->spmvAlg,
3057: matstruct->cuSpMV[opA].spmvBuffer));
3058: #else
3059: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3060: PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
3061: mat->num_rows, mat->num_cols,
3062: mat->num_entries, matstruct->alpha_one, matstruct->descr,
3063: mat->values->data().get(), mat->row_offsets->data().get(),
3064: mat->column_indices->data().get(), xptr, beta,
3065: dptr));
3066: #endif
3067: } else {
3068: if (cusparsestruct->nrows) {
3069: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3070: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3071: #else
3072: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3073: PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
3074: matstruct->alpha_one, matstruct->descr, hybMat,
3075: xptr, beta,
3076: dptr));
3077: #endif
3078: }
3079: }
3080: PetscLogGpuTimeEnd();
3082: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3083: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3084: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3085: VecCopy_SeqCUDA(yy,zz); /* zz = yy */
3086: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3087: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
3088: }
3089: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3090: VecSet_SeqCUDA(zz,0);
3091: }
3093: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3094: if (compressed) {
3095: PetscLogGpuTimeBegin();
3096: /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered),
3097: and in the destructor of its scope it calls cudaStreamSynchronize() on this stream. One would have to store all the events to
3098: prevent that. So I just added a ScatterAdd kernel instead.
3099: */
3100: #if 0
3101: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3102: thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3103: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3104: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3105: VecCUDAPlusEquals());
3106: #else
3107: PetscInt n = matstruct->cprowIndices->size();
3108: ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
3109: #endif
3110: PetscLogGpuTimeEnd();
3111: }
3112: } else {
3113: if (yy && yy != zz) {
3114: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
3115: }
3116: }
3117: VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
3118: if (yy == zz) VecCUDARestoreArray(zz,&zarray);
3119: else VecCUDARestoreArrayWrite(zz,&zarray);
3120: } catch(char *ex) {
3121: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3122: }
3123: if (yy) {
3124: PetscLogGpuFlops(2.0*a->nz);
3125: } else {
3126: PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
3127: }
3128: return 0;
3129: }
3131: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3132: {
3133: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
3134: return 0;
3135: }
3137: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
3138: {
3139: PetscObjectState onnz = A->nonzerostate;
3140: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3142: MatAssemblyEnd_SeqAIJ(A,mode);
3143: if (onnz != A->nonzerostate && cusp->deviceMat) {
3145: PetscInfo(A,"Destroy device mat since nonzerostate changed\n");
3146: cudaFree(cusp->deviceMat);
3147: cusp->deviceMat = NULL;
3148: }
3149: return 0;
3150: }
3152: /* --------------------------------------------------------------------------------*/
3153: /*@
3154: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3155: (the default sequential PETSc format). This matrix will ultimately be pushed down
3156: to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
3157: assembly performance the user should preallocate the matrix storage by setting
3158: the parameter nz (or the array nnz). By setting these parameters accurately,
3159: performance during matrix assembly can be increased by more than a factor of 50.
3161: Collective
3163: Input Parameters:
3164: + comm - MPI communicator, set to PETSC_COMM_SELF
3165: . m - number of rows
3166: . n - number of columns
3167: . nz - number of nonzeros per row (same for all rows)
3168: - nnz - array containing the number of nonzeros in the various rows
3169: (possibly different for each row) or NULL
3171: Output Parameter:
3172: . A - the matrix
3174: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3175: MatXXXXSetPreallocation() paradigm instead of this routine directly.
3176: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3178: Notes:
3179: If nnz is given then nz is ignored
3181: The AIJ format (also called the Yale sparse matrix format or
3182: compressed row storage) is fully compatible with standard Fortran 77
3183: storage. That is, the stored row and column indices can begin at
3184: either one (as in Fortran) or zero. See the users' manual for details.
3186: Specify the preallocated storage with either nz or nnz (not both).
3187: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3188: allocation. For large problems you MUST preallocate memory or you
3189: will get TERRIBLE performance, see the users' manual chapter on matrices.
3191: By default, this format uses inodes (identical nodes) when possible, to
3192: improve numerical efficiency of matrix-vector products and solves. We
3193: search for consecutive rows with the same nonzero structure, thereby
3194: reusing matrix information to achieve increased efficiency.
3196: Level: intermediate
3198: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3199: @*/
3200: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3201: {
3202: MatCreate(comm,A);
3203: MatSetSizes(*A,m,n,m,n);
3204: MatSetType(*A,MATSEQAIJCUSPARSE);
3205: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
3206: return 0;
3207: }
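/* A minimal usage sketch (illustrative, not part of this file) of the MatCreate()/MatSetType()/
   MatSeqAIJSetPreallocation() paradigm recommended above; the helper name and the preallocation of
   5 nonzeros per row are arbitrary example choices. */
#if 0
static PetscErrorCode BuildExampleAIJCUSPARSE(PetscInt m,PetscInt n,Mat *A)
{
  MatCreate(PETSC_COMM_SELF,A);         /* sequential communicator, as required by this matrix type */
  MatSetSizes(*A,m,n,m,n);
  MatSetType(*A,MATSEQAIJCUSPARSE);
  MatSeqAIJSetPreallocation(*A,5,NULL); /* the SeqAIJ preallocation call also covers the CUSPARSE subtype */
  MatSetFromOptions(*A);
  return 0;
}
#endif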
3209: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3210: {
3211: if (A->factortype == MAT_FACTOR_NONE) {
3212: MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
3213: } else {
3214: MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
3215: }
3216: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3217: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
3218: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);
3219: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3220: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3221: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3222: PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
3223: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3224: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3225: PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);
3226: MatDestroy_SeqAIJ(A);
3227: return 0;
3228: }
3230: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3231: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
3232: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
3233: {
3234: MatDuplicate_SeqAIJ(A,cpvalues,B);
3235: MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
3236: return 0;
3237: }
3239: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
3240: {
3241: Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3242: Mat_SeqAIJCUSPARSE *cy;
3243: Mat_SeqAIJCUSPARSE *cx;
3244: PetscScalar *ay;
3245: const PetscScalar *ax;
3246: CsrMatrix *csry,*csrx;
3248: cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3249: cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3250: if (X->ops->axpy != Y->ops->axpy) {
3251: MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3252: MatAXPY_SeqAIJ(Y,a,X,str);
3253: return 0;
3254: }
3255: /* if we are here, it means both matrices are bound to GPU */
3256: MatSeqAIJCUSPARSECopyToGPU(Y);
3257: MatSeqAIJCUSPARSECopyToGPU(X);
3260: csry = (CsrMatrix*)cy->mat->mat;
3261: csrx = (CsrMatrix*)cx->mat->mat;
3262: /* see if we can turn this into a cublas axpy */
3263: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3264: bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3265: if (eq) {
3266: eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3267: }
3268: if (eq) str = SAME_NONZERO_PATTERN;
3269: }
3270: /* spgeam is buggy with one column */
3271: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3273: if (str == SUBSET_NONZERO_PATTERN) {
3274: PetscScalar b = 1.0;
3275: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3276: size_t bufferSize;
3277: void *buffer;
3278: #endif
3280: MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3281: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3282: cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);
3283: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3284: PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3285: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3286: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3287: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
3288: cudaMalloc(&buffer,bufferSize);
3289: PetscLogGpuTimeBegin();
3290: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3291: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3292: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3293: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
3294: PetscLogGpuFlops(x->nz + y->nz);
3295: PetscLogGpuTimeEnd();
3296: cudaFree(buffer);
3297: #else
3298: PetscLogGpuTimeBegin();
3299: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3300: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3301: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3302: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
3303: PetscLogGpuFlops(x->nz + y->nz);
3304: PetscLogGpuTimeEnd();
3305: #endif
3306: cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);
3307: MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3308: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3309: MatSeqAIJInvalidateDiagonal(Y);
3310: } else if (str == SAME_NONZERO_PATTERN) {
3311: cublasHandle_t cublasv2handle;
3312: PetscBLASInt one = 1, bnz = 1;
3314: MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3315: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3316: PetscCUBLASGetHandle(&cublasv2handle);
3317: PetscBLASIntCast(x->nz,&bnz);
3318: PetscLogGpuTimeBegin();
3319: cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);
3320: PetscLogGpuFlops(2.0*bnz);
3321: PetscLogGpuTimeEnd();
3322: MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3323: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3324: MatSeqAIJInvalidateDiagonal(Y);
3325: } else {
3326: MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3327: MatAXPY_SeqAIJ(Y,a,X,str);
3328: }
3329: return 0;
3330: }
3332: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3333: {
3334: Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data;
3335: PetscScalar *ay;
3336: cublasHandle_t cublasv2handle;
3337: PetscBLASInt one = 1, bnz = 1;
3339: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3340: PetscCUBLASGetHandle(&cublasv2handle);
3341: PetscBLASIntCast(y->nz,&bnz);
3342: PetscLogGpuTimeBegin();
3343: cublasXscal(cublasv2handle,bnz,&a,ay,one);
3344: PetscLogGpuFlops(bnz);
3345: PetscLogGpuTimeEnd();
3346: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3347: MatSeqAIJInvalidateDiagonal(Y);
3348: return 0;
3349: }
3351: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3352: {
3353: PetscBool both = PETSC_FALSE;
3354: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3356: if (A->factortype == MAT_FACTOR_NONE) {
3357: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
3358: if (spptr->mat) {
3359: CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
3360: if (matrix->values) {
3361: both = PETSC_TRUE;
3362: thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3363: }
3364: }
3365: if (spptr->matTranspose) {
3366: CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
3367: if (matrix->values) {
3368: thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3369: }
3370: }
3371: }
3372: PetscArrayzero(a->a,a->i[A->rmap->n]);
3373: MatSeqAIJInvalidateDiagonal(A);
3374: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3375: else A->offloadmask = PETSC_OFFLOAD_CPU;
3376: return 0;
3377: }
3379: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3380: {
3381: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3383: if (A->factortype != MAT_FACTOR_NONE) {
3384: A->boundtocpu = flg;
3385: return 0;
3386: }
3387: if (flg) {
3388: MatSeqAIJCUSPARSECopyFromGPU(A);
3390: A->ops->scale = MatScale_SeqAIJ;
3391: A->ops->axpy = MatAXPY_SeqAIJ;
3392: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
3393: A->ops->mult = MatMult_SeqAIJ;
3394: A->ops->multadd = MatMultAdd_SeqAIJ;
3395: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
3396: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
3397: A->ops->multhermitiantranspose = NULL;
3398: A->ops->multhermitiantransposeadd = NULL;
3399: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
3400: PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));
3401: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3402: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3403: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3404: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3405: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3406: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);
3407: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3408: } else {
3409: A->ops->scale = MatScale_SeqAIJCUSPARSE;
3410: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
3411: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
3412: A->ops->mult = MatMult_SeqAIJCUSPARSE;
3413: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
3414: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
3415: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
3416: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3417: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3418: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
3419: a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE;
3420: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
3421: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
3422: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
3423: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
3424: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
3425: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
3427: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);
3428: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3429: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3430: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);
3431: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);
3432: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3433: }
3434: A->boundtocpu = flg;
3435: if (flg && a->inode.size) {
3436: a->inode.use = PETSC_TRUE;
3437: } else {
3438: a->inode.use = PETSC_FALSE;
3439: }
3440: return 0;
3441: }
3443: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3444: {
3445: Mat B;
3447: PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* first use of CUSPARSE may be via MatConvert */
3448: if (reuse == MAT_INITIAL_MATRIX) {
3449: MatDuplicate(A,MAT_COPY_VALUES,newmat);
3450: } else if (reuse == MAT_REUSE_MATRIX) {
3451: MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
3452: }
3453: B = *newmat;
3455: PetscFree(B->defaultvectype);
3456: PetscStrallocpy(VECCUDA,&B->defaultvectype);
3458: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3459: if (B->factortype == MAT_FACTOR_NONE) {
3460: Mat_SeqAIJCUSPARSE *spptr;
3461: PetscNew(&spptr);
3462: cusparseCreate(&spptr->handle);
3463: cusparseSetStream(spptr->handle,PetscDefaultCudaStream);
3464: spptr->format = MAT_CUSPARSE_CSR;
3465: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3466: #if CUSPARSE_VERSION > 11301
3467: spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3468: #else
3469: spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3470: #endif
3471: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3472: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3473: #endif
3474: B->spptr = spptr;
3475: } else {
3476: Mat_SeqAIJCUSPARSETriFactors *spptr;
3478: PetscNew(&spptr);
3479: cusparseCreate(&spptr->handle);
3480: cusparseSetStream(spptr->handle,PetscDefaultCudaStream);
3481: B->spptr = spptr;
3482: }
3483: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3484: }
3485: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
3486: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
3487: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
3488: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3489: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
3490: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
3492: MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
3493: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
3494: PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
3495: #if defined(PETSC_HAVE_HYPRE)
3496: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);
3497: #endif
3498: PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);
3499: return 0;
3500: }
3502: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3503: {
3504: MatCreate_SeqAIJ(B);
3505: MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
3506: return 0;
3507: }
3509: /*MC
3510: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3512: A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3513: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3514: All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3516: Options Database Keys:
3517: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3518: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3519: . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3520: - -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3522: Level: beginner
3524: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3525: M*/
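/* Illustrative command-line example (assumption, not part of this file): an existing PETSc application
   can switch to this matrix type at runtime with the Options Database Keys documented above, e.g.
     ./app -mat_type aijcusparse -vec_type cuda -mat_cusparse_mult_storage_format csr
   where -vec_type cuda is the usual companion option for GPU vectors and ./app is a placeholder name. */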
3527: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3529: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3530: {
3531: MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);
3532: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
3533: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
3534: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
3535: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
3537: return 0;
3538: }
3540: static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
3541: {
3542: Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;
3544: if (!cusp) return 0;
3545: delete cusp->cooPerm;
3546: delete cusp->cooPerm_a;
3547: cusp->cooPerm = NULL;
3548: cusp->cooPerm_a = NULL;
3549: if (cusp->use_extended_coo) {
3550: cudaFree(cusp->jmap_d);
3551: cudaFree(cusp->perm_d);
3552: }
3553: cusp->use_extended_coo = PETSC_FALSE;
3554: return 0;
3555: }
3557: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3558: {
3559: if (*cusparsestruct) {
3560: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
3561: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
3562: delete (*cusparsestruct)->workVector;
3563: delete (*cusparsestruct)->rowoffsets_gpu;
3564: delete (*cusparsestruct)->cooPerm;
3565: delete (*cusparsestruct)->cooPerm_a;
3566: delete (*cusparsestruct)->csr2csc_i;
3567: if ((*cusparsestruct)->handle) cusparseDestroy((*cusparsestruct)->handle);
3568: if ((*cusparsestruct)->jmap_d) cudaFree((*cusparsestruct)->jmap_d);
3569: if ((*cusparsestruct)->perm_d) cudaFree((*cusparsestruct)->perm_d);
3570: PetscFree(*cusparsestruct);
3571: }
3572: return 0;
3573: }
3575: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3576: {
3577: if (*mat) {
3578: delete (*mat)->values;
3579: delete (*mat)->column_indices;
3580: delete (*mat)->row_offsets;
3581: delete *mat;
3582: *mat = 0;
3583: }
3584: return 0;
3585: }
3587: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3588: {
3589: if (*trifactor) {
3590: if ((*trifactor)->descr) cusparseDestroyMatDescr((*trifactor)->descr);
3591: if ((*trifactor)->solveInfo) cusparse_destroy_analysis_info((*trifactor)->solveInfo);
3592: CsrMatrix_Destroy(&(*trifactor)->csrMat);
3593: if ((*trifactor)->solveBuffer) cudaFree((*trifactor)->solveBuffer);
3594: if ((*trifactor)->AA_h) cudaFreeHost((*trifactor)->AA_h);
3595: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3596: if ((*trifactor)->csr2cscBuffer) cudaFree((*trifactor)->csr2cscBuffer);
3597: #endif
3598: PetscFree(*trifactor);
3599: }
3600: return 0;
3601: }
3603: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
3604: {
3605: CsrMatrix *mat;
3607: if (*matstruct) {
3608: if ((*matstruct)->mat) {
3609: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3610: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3611: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3612: #else
3613: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3614: cusparseDestroyHybMat(hybMat);
3615: #endif
3616: } else {
3617: mat = (CsrMatrix*)(*matstruct)->mat;
3618: CsrMatrix_Destroy(&mat);
3619: }
3620: }
3621: if ((*matstruct)->descr) cusparseDestroyMatDescr((*matstruct)->descr);
3622: delete (*matstruct)->cprowIndices;
3623: if ((*matstruct)->alpha_one) cudaFree((*matstruct)->alpha_one);
3624: if ((*matstruct)->beta_zero) cudaFree((*matstruct)->beta_zero);
3625: if ((*matstruct)->beta_one) cudaFree((*matstruct)->beta_one);
3627: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3628: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3629: if (mdata->matDescr) cusparseDestroySpMat(mdata->matDescr);
3630: for (int i=0; i<3; i++) {
3631: if (mdata->cuSpMV[i].initialized) {
3632: cudaFree(mdata->cuSpMV[i].spmvBuffer);
3633: cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);
3634: cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);
3635: }
3636: }
3637: #endif
3638: delete *matstruct;
3639: *matstruct = NULL;
3640: }
3641: return 0;
3642: }
3644: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
3645: {
3646: if (*trifactors) {
3647: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
3648: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
3649: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
3650: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
3651: delete (*trifactors)->rpermIndices;
3652: delete (*trifactors)->cpermIndices;
3653: delete (*trifactors)->workVector;
3654: (*trifactors)->rpermIndices = NULL;
3655: (*trifactors)->cpermIndices = NULL;
3656: (*trifactors)->workVector = NULL;
3657: if ((*trifactors)->a_band_d) cudaFree((*trifactors)->a_band_d);
3658: if ((*trifactors)->i_band_d) cudaFree((*trifactors)->i_band_d);
3659: (*trifactors)->init_dev_prop = PETSC_FALSE;
3660: }
3661: return 0;
3662: }
3664: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3665: {
3666: cusparseHandle_t handle;
3668: if (*trifactors) {
3669: MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
3670: if (handle = (*trifactors)->handle) {
3671: cusparseDestroy(handle);
3672: }
3673: PetscFree(*trifactors);
3674: }
3675: return 0;
3676: }
3678: struct IJCompare
3679: {
3680: __host__ __device__
3681: inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3682: {
3683: if (t1.get<0>() < t2.get<0>()) return true;
3684: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3685: return false;
3686: }
3687: };
3689: struct IJEqual
3690: {
3691: __host__ __device__
3692: inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3693: {
3694: if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3695: return true;
3696: }
3697: };
3699: struct IJDiff
3700: {
3701: __host__ __device__
3702: inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3703: {
3704: return t1 == t2 ? 0 : 1;
3705: }
3706: };
3708: struct IJSum
3709: {
3710: __host__ __device__
3711: inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3712: {
3713: return t1||t2;
3714: }
3715: };
3717: #include <thrust/iterator/discard_iterator.h>
3718: /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
3719: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
3720: {
3721: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3722: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3723: THRUSTARRAY *cooPerm_v = NULL;
3724: thrust::device_ptr<const PetscScalar> d_v;
3725: CsrMatrix *matrix;
3726: PetscInt n;
3730: if (!cusp->cooPerm) {
3731: MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
3732: MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
3733: return 0;
3734: }
3735: matrix = (CsrMatrix*)cusp->mat->mat;
3737: if (!v) {
3738: if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3739: goto finalize;
3740: }
3741: n = cusp->cooPerm->size();
3742: if (isCudaMem(v)) {
3743: d_v = thrust::device_pointer_cast(v);
3744: } else {
3745: cooPerm_v = new THRUSTARRAY(n);
3746: cooPerm_v->assign(v,v+n);
3747: d_v = cooPerm_v->data();
3748: PetscLogCpuToGpu(n*sizeof(PetscScalar));
3749: }
3750: PetscLogGpuTimeBegin();
3751: if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3752: if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them up first */
3753: THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3754: auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3755: /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
3756: cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
3757: cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
3758: */
3759: thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3760: thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3761: delete cooPerm_w;
3762: } else {
3763: /* all nonzeros in d_v[] are unique entries */
3764: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3765: matrix->values->begin()));
3766: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3767: matrix->values->end()));
3768: thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
3769: }
3770: } else {
3771: if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3772: auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3773: thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3774: } else {
3775: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3776: matrix->values->begin()));
3777: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3778: matrix->values->end()));
3779: thrust::for_each(zibit,zieit,VecCUDAEquals());
3780: }
3781: }
3782: PetscLogGpuTimeEnd();
3783: finalize:
3784: delete cooPerm_v;
3785: A->offloadmask = PETSC_OFFLOAD_GPU;
3786: PetscObjectStateIncrease((PetscObject)A);
3787: /* shorter version of MatAssemblyEnd_SeqAIJ */
3788: PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz);
3789: PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");
3790: PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax);
3791: a->reallocs = 0;
3792: A->info.mallocs += 0;
3793: A->info.nz_unneeded = 0;
3794: A->assembled = A->was_assembled = PETSC_TRUE;
3795: A->num_ass++;
3796: return 0;
3797: }
3799: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3800: {
3801: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3804: if (!cusp) return 0;
3805: if (destroy) {
3806: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
3807: delete cusp->csr2csc_i;
3808: cusp->csr2csc_i = NULL;
3809: }
3810: A->transupdated = PETSC_FALSE;
3811: return 0;
3812: }
3814: #include <thrust/binary_search.h>
3815: /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
3816: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
3817: {
3818: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3819: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3820: PetscInt cooPerm_n, nzr = 0;
3822: PetscLayoutSetUp(A->rmap);
3823: PetscLayoutSetUp(A->cmap);
3824: cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3825: if (n != cooPerm_n) {
3826: delete cusp->cooPerm;
3827: delete cusp->cooPerm_a;
3828: cusp->cooPerm = NULL;
3829: cusp->cooPerm_a = NULL;
3830: }
3831: if (n) {
3832: THRUSTINTARRAY d_i(n);
3833: THRUSTINTARRAY d_j(n);
3834: THRUSTINTARRAY ii(A->rmap->n);
3836: if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); }
3837: if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
3839: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
3840: d_i.assign(coo_i,coo_i+n);
3841: d_j.assign(coo_j,coo_j+n);
3843: /* Ex.
3844: n = 6
3845: coo_i = [3,3,1,4,1,4]
3846: coo_j = [3,2,2,5,2,6]
3847: */
3848: auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3849: auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
3851: PetscLogGpuTimeBegin();
3852: thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3853: thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
3854: *cusp->cooPerm_a = d_i; /* copy the sorted array */
3855: THRUSTINTARRAY w = d_j;
3857: /*
3858: d_i = [1,1,3,3,4,4]
3859: d_j = [2,2,2,3,5,6]
3860: cooPerm = [2,4,1,0,3,5]
3861: */
3862: auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
3864: /*
3865: d_i = [1,3,3,4,4,x]
3866: ^ekey
3867: d_j = [2,2,3,5,6,x]
3868: ^nekey
3869: */
3870: if (nekey == ekey) { /* all entries are unique */
3871: delete cusp->cooPerm_a;
3872: cusp->cooPerm_a = NULL;
3873: } else { /* Stefano: I couldn't come up with a more elegant algorithm */
3874: /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
3875: adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
3876: adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
3877: (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
3878: w[0] = 0;
3879: thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/
3880: thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
3881: }
3882: thrust::counting_iterator<PetscInt> search_begin(0);
3883: thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search the values 0,1,...,A->rmap->n-1 in the ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
3884: search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
3885: ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
3886: PetscLogGpuTimeEnd();
3888: MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);
3889: a->singlemalloc = PETSC_FALSE;
3890: a->free_a = PETSC_TRUE;
3891: a->free_ij = PETSC_TRUE;
3892: PetscMalloc1(A->rmap->n+1,&a->i);
3893: a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
3894: cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);
3895: a->nz = a->maxnz = a->i[A->rmap->n];
3896: a->rmax = 0;
3897: PetscMalloc1(a->nz,&a->a);
3898: PetscMalloc1(a->nz,&a->j);
3899: cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);
3900: if (!a->ilen) PetscMalloc1(A->rmap->n,&a->ilen);
3901: if (!a->imax) PetscMalloc1(A->rmap->n,&a->imax);
3902: for (PetscInt i = 0; i < A->rmap->n; i++) {
3903: const PetscInt nnzr = a->i[i+1] - a->i[i];
3904: nzr += (PetscInt)!!(nnzr);
3905: a->ilen[i] = a->imax[i] = nnzr;
3906: a->rmax = PetscMax(a->rmax,nnzr);
3907: }
3908: a->nonzerorowcnt = nzr;
3909: A->preallocated = PETSC_TRUE;
3910: PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));
3911: MatMarkDiagonal_SeqAIJ(A);
3912: } else {
3913: MatSeqAIJSetPreallocation(A,0,NULL);
3914: }
3915: MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);
3917: /* We want to allocate the CUSPARSE struct for matvec now.
3918: The code is so convoluted now that I prefer to copy zeros */
3919: PetscArrayzero(a->a,a->nz);
3920: MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);
3921: A->offloadmask = PETSC_OFFLOAD_CPU;
3922: MatSeqAIJCUSPARSECopyToGPU(A);
3923: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
3924: return 0;
3925: }
3927: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
3928: {
3929: Mat_SeqAIJ *seq;
3930: Mat_SeqAIJCUSPARSE *dev;
3931: PetscBool coo_basic = PETSC_TRUE;
3932: PetscMemType mtype = PETSC_MEMTYPE_DEVICE;
3934: MatResetPreallocationCOO_SeqAIJ(mat);
3935: MatResetPreallocationCOO_SeqAIJCUSPARSE(mat);
3936: if (coo_i) {
3937: PetscGetMemType(coo_i,&mtype);
3938: if (PetscMemTypeHost(mtype)) {
3939: for (PetscCount k=0; k<coo_n; k++) {
3940: if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
3941: }
3942: }
3943: }
3945: if (coo_basic) { /* i,j are on device or do not contain negative indices */
3946: MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);
3947: } else {
3948: MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j);
3949: mat->offloadmask = PETSC_OFFLOAD_CPU;
3950: MatSeqAIJCUSPARSECopyToGPU(mat);
3951: seq = static_cast<Mat_SeqAIJ*>(mat->data);
3952: dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
3953: cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount));
3954: cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);
3955: cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount));
3956: cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice);
3957: dev->use_extended_coo = PETSC_TRUE;
3958: }
3959: return 0;
3960: }
3962: __global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
3963: {
3964: PetscCount i = blockIdx.x*blockDim.x + threadIdx.x;
3965: const PetscCount grid_size = gridDim.x * blockDim.x;
3966: for (; i<nnz; i+= grid_size) {
3967: PetscScalar sum = 0.0;
3968: for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
3969: a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
3970: }
3971: }
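/* Worked example for the kernel above (illustrative data): suppose nnz = 3 assembled nonzeros are fed by
   5 COO entries, where COO entries 0 and 2 map to nonzero 0, entry 1 maps to nonzero 1, and entries 3 and 4
   map to nonzero 2. Then jmap = [0,2,3,5] and perm = [0,2,1,3,4], so the kernel computes
   a[0] = kv[0]+kv[2], a[1] = kv[1], a[2] = kv[3]+kv[4] (added to the old a[i] unless imode == INSERT_VALUES). */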
3973: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3974: {
3975: Mat_SeqAIJ *seq = (Mat_SeqAIJ*)A->data;
3976: Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
3977: PetscCount Annz = seq->nz;
3978: PetscMemType memtype;
3979: const PetscScalar *v1 = v;
3980: PetscScalar *Aa;
3982: if (dev->use_extended_coo) {
3983: PetscGetMemType(v,&memtype);
3984: if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, copy it to the device first */
3985: cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar));
3986: cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);
3987: }
3989: if (imode == INSERT_VALUES) MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);
3990: else MatSeqAIJCUSPARSEGetArray(A,&Aa);
3992: if (Annz) {
3993: MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
3994: cudaPeekAtLastError();
3995: }
3997: if (imode == INSERT_VALUES) MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);
3998: else MatSeqAIJCUSPARSERestoreArray(A,&Aa);
4000: if (PetscMemTypeHost(memtype)) cudaFree((void*)v1);
4001: } else {
4002: MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode);
4003: }
4004: return 0;
4005: }
4007: /*@C
4008: MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4010: Not collective
4012: Input Parameters:
4013: + A - the matrix
4014: -  compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4016: Output Parameters:
4017: + i - the CSR row pointers
4018: - j - the CSR column indices
4020: Level: developer
4022: Notes:
4023: When compressed is true, the CSR structure does not contain empty rows
4025: .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4026: @*/
4027: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
4028: {
4029: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4030: CsrMatrix *csr;
4031: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4034: if (!i || !j) return 0;
4037: MatSeqAIJCUSPARSECopyToGPU(A);
4039: csr = (CsrMatrix*)cusp->mat->mat;
4040: if (i) {
4041: if (!compressed && a->compressedrow.use) { /* need full row offset */
4042: if (!cusp->rowoffsets_gpu) {
4043: cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4044: cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4045: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
4046: }
4047: *i = cusp->rowoffsets_gpu->data().get();
4048: } else *i = csr->row_offsets->data().get();
4049: }
4050: if (j) *j = csr->column_indices->data().get();
4051: return 0;
4052: }
4054: /*@C
4055: MatSeqAIJCUSPARSERestoreIJ - restore the device CSR row offsets and column indices obtained with MatSeqAIJCUSPARSEGetIJ()
4057: Not collective
4059: Input Parameters:
4060: + A - the matrix
4061: - compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure was returned in compressed form
4063: Output Parameters:
4064: + i - the CSR row pointers
4065: - j - the CSR column indices
4067: Level: developer
4069: .seealso: MatSeqAIJCUSPARSEGetIJ()
4070: @*/
4071: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
4072: {
4075: if (i) *i = NULL;
4076: if (j) *j = NULL;
4077: return 0;
4078: }
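/* Example usage (sketch): obtaining the device CSR structure, e.g. to hand it to a custom kernel;
   the pointers are only borrowed and must be restored, not freed.

     const int *d_i,*d_j;
     MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&d_i,&d_j);   // full row offsets: A->rmap->n+1 entries in d_i
     // ... use d_i/d_j in device code ...
     MatSeqAIJCUSPARSERestoreIJ(A,PETSC_FALSE,&d_i,&d_j);
*/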
4080: /*@C
4081: MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4083: Not Collective
4085: Input Parameter:
4086: . A - a MATSEQAIJCUSPARSE matrix
4088: Output Parameter:
4089: . a - pointer to the device data
4091: Level: developer
4093: Notes: May trigger a host-to-device copy if the up-to-date matrix data is on the host
4095: .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4096: @*/
4097: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
4098: {
4099: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4100: CsrMatrix *csr;
4106: MatSeqAIJCUSPARSECopyToGPU(A);
4108: csr = (CsrMatrix*)cusp->mat->mat;
4110: *a = csr->values->data().get();
4111: return 0;
4112: }
4114: /*@C
4115: MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4117: Not Collective
4119: Input Parameter:
4120: . A - a MATSEQAIJCUSPARSE matrix
4122: Output Parameter:
4123: . a - pointer to the device data
4125: Level: developer
4127: .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4128: @*/
4129: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
4130: {
4134: *a = NULL;
4135: return 0;
4136: }
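/* Example usage (sketch): read-only access to the nonzero values on the device.

     const PetscScalar *d_a;
     MatSeqAIJCUSPARSEGetArrayRead(A,&d_a);     // may copy the host data to the device first
     // ... pass d_a to a kernel or cuSPARSE call that only reads it ...
     MatSeqAIJCUSPARSERestoreArrayRead(A,&d_a);
*/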
4138: /*@C
4139: MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4141: Not Collective
4143: Input Parameter:
4144: . A - a MATSEQAIJCUSPARSE matrix
4146: Output Parameter:
4147: . a - pointer to the device data
4149: Level: developer
4151: Notes: May trigger a host-to-device copy if the up-to-date matrix data is on the host
4153: .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4154: @*/
4155: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
4156: {
4157: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4158: CsrMatrix *csr;
4164: MatSeqAIJCUSPARSECopyToGPU(A);
4166: csr = (CsrMatrix*)cusp->mat->mat;
4168: *a = csr->values->data().get();
4169: A->offloadmask = PETSC_OFFLOAD_GPU;
4170: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
4171: return 0;
4172: }
4173: /*@C
4174: MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4176: Not Collective
4178: Input Parameter:
4179: . A - a MATSEQAIJCUSPARSE matrix
4181: Output Parameter:
4182: . a - pointer to the device data
4184: Level: developer
4186: .seealso: MatSeqAIJCUSPARSEGetArray()
4187: @*/
4188: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
4189: {
4193: MatSeqAIJInvalidateDiagonal(A);
4194: PetscObjectStateIncrease((PetscObject)A);
4195: *a = NULL;
4196: return 0;
4197: }
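/* Example usage (sketch): modifying the nonzero values in place on the device; the restore call
   invalidates the cached diagonal and increases the object state so PETSc sees the change.

     PetscScalar *d_a;
     MatSeqAIJCUSPARSEGetArray(A,&d_a);     // may copy the host data to the device first
     // ... update the values behind d_a (custom kernel, Thrust, cuBLAS, ...) ...
     MatSeqAIJCUSPARSERestoreArray(A,&d_a);
*/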
4199: /*@C
4200: MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4202: Not Collective
4204: Input Parameter:
4205: . A - a MATSEQAIJCUSPARSE matrix
4207: Output Parameter:
4208: . a - pointer to the device data
4210: Level: developer
4212: Notes: Does not trigger a host-to-device copy, since the current values are not needed; the device copy of the data is marked as the valid one
4214: .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
4215: @*/
4216: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
4217: {
4218: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4219: CsrMatrix *csr;
4226: csr = (CsrMatrix*)cusp->mat->mat;
4228: *a = csr->values->data().get();
4229: A->offloadmask = PETSC_OFFLOAD_GPU;
4230: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
4231: return 0;
4232: }
4234: /*@C
4235: MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4237: Not Collective
4239: Input Parameter:
4240: . A - a MATSEQAIJCUSPARSE matrix
4242: Output Parameter:
4243: . a - pointer to the device data
4245: Level: developer
4247: .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
4248: @*/
4249: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
4250: {
4254: MatSeqAIJInvalidateDiagonal(A);
4255: PetscObjectStateIncrease((PetscObject)A);
4256: *a = NULL;
4257: return 0;
4258: }
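/* Example usage (sketch): overwriting every stored value without reading the old ones, so no
   host-to-device copy is needed; nnz below stands for the number of stored nonzeros and is
   illustrative.

     PetscScalar *d_a;
     MatSeqAIJCUSPARSEGetArrayWrite(A,&d_a);
     cudaMemset(d_a,0,nnz*sizeof(PetscScalar));   // all-zero bits is 0.0 for PetscScalar
     MatSeqAIJCUSPARSERestoreArrayWrite(A,&d_a);
*/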
4260: struct IJCompare4
4261: {
4262: __host__ __device__
4263: inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4264: {
4265: if (t1.get<0>() < t2.get<0>()) return true;
4266: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4267: return false;
4268: }
4269: };
4271: struct Shift
4272: {
4273: int _shift;
4275: Shift(int shift) : _shift(shift) {}
4276: __host__ __device__
4277: inline int operator() (const int &c)
4278: {
4279: return c + _shift;
4280: }
4281: };
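/* IJCompare4 orders (row,col,value,flag) tuples lexicographically by (row,col), and Shift adds a
   fixed offset to an integer index; both are used by the row-wise merge below. For instance,

     thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n))

   yields B's column indices renumbered to follow A's columns without materializing a shifted copy. */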
4283: /* merges two SeqAIJCUSPARSE matrices A and B by concatenating them side by side, C = [A B] (the [A';B']' operation in MATLAB notation) */
4284: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4285: {
4286: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4287: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4288: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4289: CsrMatrix *Acsr,*Bcsr,*Ccsr;
4290: PetscInt Annz,Bnnz;
4291: cusparseStatus_t stat;
4292: PetscInt i,m,n,zero = 0;
4303: if (reuse == MAT_INITIAL_MATRIX) {
4304: m = A->rmap->n;
4305: n = A->cmap->n + B->cmap->n;
4306: MatCreate(PETSC_COMM_SELF,C);
4307: MatSetSizes(*C,m,n,m,n);
4308: MatSetType(*C,MATSEQAIJCUSPARSE);
4309: c = (Mat_SeqAIJ*)(*C)->data;
4310: Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4311: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
4312: Ccsr = new CsrMatrix;
4313: Cmat->cprowIndices = NULL;
4314: c->compressedrow.use = PETSC_FALSE;
4315: c->compressedrow.nrows = 0;
4316: c->compressedrow.i = NULL;
4317: c->compressedrow.rindex = NULL;
4318: Ccusp->workVector = NULL;
4319: Ccusp->nrows = m;
4320: Ccusp->mat = Cmat;
4321: Ccusp->mat->mat = Ccsr;
4322: Ccsr->num_rows = m;
4323: Ccsr->num_cols = n;
4324: cusparseCreateMatDescr(&Cmat->descr);
4325: cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);
4326: cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
4327: cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));
4328: cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));
4329: cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));
4330: cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
4331: cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);
4332: cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
4333: MatSeqAIJCUSPARSECopyToGPU(A);
4334: MatSeqAIJCUSPARSECopyToGPU(B);
4338: Acsr = (CsrMatrix*)Acusp->mat->mat;
4339: Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4340: Annz = (PetscInt)Acsr->column_indices->size();
4341: Bnnz = (PetscInt)Bcsr->column_indices->size();
4342: c->nz = Annz + Bnnz;
4343: Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4344: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4345: Ccsr->values = new THRUSTARRAY(c->nz);
4346: Ccsr->num_entries = c->nz;
4347: Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4348: if (c->nz) {
4349: auto Acoo = new THRUSTINTARRAY32(Annz);
4350: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4351: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4352: THRUSTINTARRAY32 *Aroff,*Broff;
4354: if (a->compressedrow.use) { /* need full row offset */
4355: if (!Acusp->rowoffsets_gpu) {
4356: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4357: Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4358: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
4359: }
4360: Aroff = Acusp->rowoffsets_gpu;
4361: } else Aroff = Acsr->row_offsets;
4362: if (b->compressedrow.use) { /* need full row offset */
4363: if (!Bcusp->rowoffsets_gpu) {
4364: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4365: Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4366: PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
4367: }
4368: Broff = Bcusp->rowoffsets_gpu;
4369: } else Broff = Bcsr->row_offsets;
4370: PetscLogGpuTimeBegin();
4371: stat = cusparseXcsr2coo(Acusp->handle,
4372: Aroff->data().get(),
4373: Annz,
4374: m,
4375: Acoo->data().get(),
4376: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4377: stat = cusparseXcsr2coo(Bcusp->handle,
4378: Broff->data().get(),
4379: Bnnz,
4380: m,
4381: Bcoo->data().get(),
4382: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4383: /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4384: auto Aperm = thrust::make_constant_iterator(1);
4385: auto Bperm = thrust::make_constant_iterator(0);
4386: #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4387: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4388: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4389: #else
4390: /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4391: auto Bcib = Bcsr->column_indices->begin();
4392: auto Bcie = Bcsr->column_indices->end();
4393: thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4394: #endif
4395: auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4396: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4397: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4398: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4399: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4400: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4401: auto p1 = Ccusp->cooPerm->begin();
4402: auto p2 = Ccusp->cooPerm->begin();
4403: thrust::advance(p2,Annz);
4404: PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4405: #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4406: thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4407: #endif
4408: auto cci = thrust::make_counting_iterator(zero);
4409: auto cce = thrust::make_counting_iterator(c->nz);
4410: #if 0 //Errors on SUMMIT cuda 11.1.0
4411: PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4412: #else
4413: auto pred = thrust::identity<int>();
4414: PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4415: PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4416: #endif
4417: stat = cusparseXcoo2csr(Ccusp->handle,
4418: Ccoo->data().get(),
4419: c->nz,
4420: m,
4421: Ccsr->row_offsets->data().get(),
4422: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4423: PetscLogGpuTimeEnd();
4424: delete wPerm;
4425: delete Acoo;
4426: delete Bcoo;
4427: delete Ccoo;
4428: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4429: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4430: Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4431: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4432: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4433: #endif
4434: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B both form explicit transposes, generate the transpose of C too */
4435: MatSeqAIJCUSPARSEFormExplicitTranspose(A);
4436: MatSeqAIJCUSPARSEFormExplicitTranspose(B);
4437: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4438: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4439: CsrMatrix *CcsrT = new CsrMatrix;
4440: CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4441: CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4443: (*C)->form_explicit_transpose = PETSC_TRUE;
4444: (*C)->transupdated = PETSC_TRUE;
4445: Ccusp->rowoffsets_gpu = NULL;
4446: CmatT->cprowIndices = NULL;
4447: CmatT->mat = CcsrT;
4448: CcsrT->num_rows = n;
4449: CcsrT->num_cols = m;
4450: CcsrT->num_entries = c->nz;
4452: CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4453: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4454: CcsrT->values = new THRUSTARRAY(c->nz);
4456: PetscLogGpuTimeBegin();
4457: auto rT = CcsrT->row_offsets->begin();
4458: if (AT) {
4459: rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4460: thrust::advance(rT,-1);
4461: }
4462: if (BT) {
4463: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4464: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4465: thrust::copy(titb,tite,rT);
4466: }
4467: auto cT = CcsrT->column_indices->begin();
4468: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4469: if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4470: auto vT = CcsrT->values->begin();
4471: if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4472: if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4473: PetscLogGpuTimeEnd();
4475: cusparseCreateMatDescr(&CmatT->descr);
4476: cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);
4477: cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
4478: cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));
4479: cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));
4480: cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));
4481: cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
4482: cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);
4483: cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
4484: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4485: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4486: CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4487: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4488: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4489: #endif
4490: Ccusp->matTranspose = CmatT;
4491: }
4492: }
4494: c->singlemalloc = PETSC_FALSE;
4495: c->free_a = PETSC_TRUE;
4496: c->free_ij = PETSC_TRUE;
4497: PetscMalloc1(m+1,&c->i);
4498: PetscMalloc1(c->nz,&c->j);
4499: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4500: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4501: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4502: ii = *Ccsr->row_offsets;
4503: jj = *Ccsr->column_indices;
4504: cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
4505: cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
4506: } else {
4507: cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
4508: cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
4509: }
4510: PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
4511: PetscMalloc1(m,&c->ilen);
4512: PetscMalloc1(m,&c->imax);
4513: c->maxnz = c->nz;
4514: c->nonzerorowcnt = 0;
4515: c->rmax = 0;
4516: for (i = 0; i < m; i++) {
4517: const PetscInt nn = c->i[i+1] - c->i[i];
4518: c->ilen[i] = c->imax[i] = nn;
4519: c->nonzerorowcnt += (PetscInt)!!nn;
4520: c->rmax = PetscMax(c->rmax,nn);
4521: }
4522: MatMarkDiagonal_SeqAIJ(*C);
4523: PetscMalloc1(c->nz,&c->a);
4524: (*C)->nonzerostate++;
4525: PetscLayoutSetUp((*C)->rmap);
4526: PetscLayoutSetUp((*C)->cmap);
4527: Ccusp->nonzerostate = (*C)->nonzerostate;
4528: (*C)->preallocated = PETSC_TRUE;
4529: } else {
4531: c = (Mat_SeqAIJ*)(*C)->data;
4532: if (c->nz) {
4533: Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4537: MatSeqAIJCUSPARSECopyToGPU(A);
4538: MatSeqAIJCUSPARSECopyToGPU(B);
4541: Acsr = (CsrMatrix*)Acusp->mat->mat;
4542: Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4543: Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4549: auto pmid = Ccusp->cooPerm->begin();
4550: thrust::advance(pmid,Acsr->num_entries);
4551: PetscLogGpuTimeBegin();
4552: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4553: thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4554: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4555: thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4556: thrust::for_each(zibait,zieait,VecCUDAEquals());
4557: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4558: thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4559: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4560: thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4561: thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4562: MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);
4563: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4565: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4566: CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4567: CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4568: CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4569: auto vT = CcsrT->values->begin();
4570: if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4571: if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4572: (*C)->transupdated = PETSC_TRUE;
4573: }
4574: PetscLogGpuTimeEnd();
4575: }
4576: }
4577: PetscObjectStateIncrease((PetscObject)*C);
4578: (*C)->assembled = PETSC_TRUE;
4579: (*C)->was_assembled = PETSC_FALSE;
4580: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4581: return 0;
4582: }
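/* Example usage (sketch): A and B are sequential CUSPARSE matrices with the same number of rows;
   C ends up with A->cmap->n + B->cmap->n columns.

     Mat C = NULL;
     MatSeqAIJCUSPARSEMergeMats(A,B,MAT_INITIAL_MATRIX,&C);   // builds the structure and values of C = [A B]
     // ... change values (but not the nonzero pattern) of A and/or B ...
     MatSeqAIJCUSPARSEMergeMats(A,B,MAT_REUSE_MATRIX,&C);     // refreshes the values of C in place
*/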
4584: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4585: {
4586: bool dmem;
4587: const PetscScalar *av;
4589: dmem = isCudaMem(v);
4590: MatSeqAIJCUSPARSEGetArrayRead(A,&av);
4591: if (n && idx) {
4592: THRUSTINTARRAY widx(n);
4593: widx.assign(idx,idx+n);
4594: PetscLogCpuToGpu(n*sizeof(PetscInt));
4596: THRUSTARRAY *w = NULL;
4597: thrust::device_ptr<PetscScalar> dv;
4598: if (dmem) {
4599: dv = thrust::device_pointer_cast(v);
4600: } else {
4601: w = new THRUSTARRAY(n);
4602: dv = w->data();
4603: }
4604: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4606: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4607: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4608: thrust::for_each(zibit,zieit,VecCUDAEquals());
4609: if (w) {
4610: cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
4611: }
4612: delete w;
4613: } else {
4614: cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);
4615: }
4616: if (!dmem) PetscLogGpuToCpu(n*sizeof(PetscScalar));
4617: MatSeqAIJCUSPARSERestoreArrayRead(A,&av);
4618: return 0;
4619: }
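/* In effect the routine above performs a device-side gather v[k] = aa[idx[k]], k = 0..n-1, where aa
   is the matrix value array; when idx is NULL it falls back to a plain copy of the first n values,
   and the result is copied back to the host only when v is host memory. */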