Actual source code: aijcusparse.cu
petsc-3.14.6 2021-03-30
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
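
/* Note: PetscOptionsEnum() maps a string to its 0-based position in the corresponding list above,
   so the entries must be listed in increasing cuSPARSE enum-value order; this is why "COO_ALG4"
   (value 5) appears after "CSR_ALG1" (value 4) in MatCUSPARSESpMMAlgorithms[]. The consistency
   checks in MatSetFromOptions_SeqAIJCUSPARSE() below guard against cuSPARSE renumbering these enums. */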

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  return(0);
}

PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  return(0);
}

PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
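  /* note: the handle is only dropped here, not destroyed; presumably it remains owned by
     whoever supplied it through MatCUSPARSESetHandle() */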
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  return(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  *type = MATSOLVERCUSPARSE;
  return(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
  on a single GPU, for matrices of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
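
/* Example usage (a sketch; the option names follow the standard PETSc factor interface):
     ./app -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
   or, programmatically, select the solver with PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE). */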

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt n = A->rmap->n;

  MatCreate(PetscObjectComm((PetscObject)A),B);
  MatSetSizes(*B,n,n,n,n);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  MatSetType(*B,MATSEQAIJCUSPARSE);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    MatSetBlockSizesFromMats(*B,A,A);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
  PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
  return(0);
}

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  return(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
  operation. Only the MatMult operation can use different GPU storage formats
  for MPIAIJCUSPARSE matrices.

  Not Collective

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
. op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  return(0);
}

/*@
  MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

  Collective on mat

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
- transgen - the boolean flag

  Level: intermediate

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscBool flg;

  PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
    }
  }
  return(0);
}
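
/* Example usage (a sketch): when the same matrix is applied transposed many times, build the
   explicit transpose once so later MatMultTranspose() calls reuse it:
     MatSeqAIJCUSPARSESetGenerateTranspose(A,PETSC_TRUE);
   the equivalent database option is -mat_cusparse_transgen (see MatSetFromOptions below). */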

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);
    if (flg) {MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);}

    PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
    if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);}

    PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
    if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                     "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
    /* If the user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                     "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                     "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsTail();
  return(0);
}
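
/* For instance, a run combining the options registered above might use
     ./app -mat_type seqaijcusparse -mat_cusparse_transgen -mat_cusparse_storage_format csr
   (illustrative only; note that ELL/HYB formats are rejected with CUDA 11 and later, see
   MatSeqAIJCUSPARSEGenerateTransposeForMult() below). */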

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  if (!n) return(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
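      /* In PETSc's factored AIJ layout, ai/aj/aa hold the strictly lower triangular part of L
         row by row with the unit diagonal implicit (row 0 has no strictly lower entries),
         which is why the 1's on the diagonal are inserted explicitly below. */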
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;
      v       = aa;
      vi      = aj;
      offset    = 1;
      rowOffset = 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        PetscArraycpy(&(AjLo[offset]), vi, nz);
        PetscArraycpy(&(AALo[offset]), v, nz);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  return(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  if (!n) return(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
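      /* In the factored AIJ layout the rows of U are addressed through adiag: adiag[i] points at
         the diagonal entry of row i (stored by the factorization as its reciprocal, hence the
         1./v[nz] below) and adiag[i+1]+1 at its first off-diagonal entry, with rows laid out
         back to front, so nzUpper = adiag[0]-adiag[n]. */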

      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        PetscArraycpy(&(AjUp[offset+1]), vi, nz);
        PetscArraycpy(&(AAUp[offset+1]), v, nz);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = n;
      upTriFactor->csrMat->num_cols    = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  return(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ISGetIndices(isrow,&r);
  ISIdentity(isrow,&row_identity);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ISRestoreIndices(isrow,&r);

  /* upper triangular indices */
  ISGetIndices(iscol,&c);
  ISIdentity(iscol,&col_identity);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  if (!row_identity && !col_identity) {
    PetscLogCpuToGpu(2*n*sizeof(PetscInt));
  } else if (!row_identity) {
    PetscLogCpuToGpu(n*sizeof(PetscInt));
  } else if (!col_identity) {
    PetscLogCpuToGpu(n*sizeof(PetscInt));
  }

  ISRestoreIndices(iscol,&c);
  return(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  if (!n) return(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          PetscArraycpy(&(AjUp[offset]), vj, nz);
          PetscArraycpy(&(AAUp[offset]), v, nz);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = A->rmap->n;
      upTriFactor->csrMat->num_cols    = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
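      /* The "lower" factor deliberately reuses the upper-triangular sparsity pattern (AiUp/AjUp)
         with its own scaled values (AALo): combined with CUSPARSE_FILL_MODE_UPPER and the
         CUSPARSE_OPERATION_TRANSPOSE solve operation set below, the triangular solve effectively
         applies the lower factor as the transpose of the stored upper matrix. */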
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = A->rmap->n;
      loTriFactor->csrMat->num_cols    = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  return(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
  MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;
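  /* a->nz counts the upper triangle including the diagonal, so the two triangular factors
     together hold the (a->nz - n) off-diagonal entries twice plus the n diagonal entries */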

  /* lower triangular indices */
  ISGetIndices(ip,&rip);
  ISIdentity(ip,&perm_identity);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    ISInvertPermutation(ip,PETSC_DECIDE,&iip);
    ISGetIndices(iip,&irip);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ISRestoreIndices(iip,&irip);
    ISDestroy(&iip);
    PetscLogCpuToGpu(2*n*sizeof(PetscInt));
  }
  ISRestoreIndices(ip,&rip);
  return(0);
}

static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  IS         isrow = b->row,iscol = b->col;
  PetscBool  row_identity,col_identity;

  MatLUFactorNumeric_SeqAIJ(B,A,info);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ISIdentity(isrow,&row_identity);
  ISIdentity(iscol,&col_identity);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
  return(0);
}

static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  IS         ip = b->row;
  PetscBool  perm_identity;

  MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ISIdentity(ip,&perm_identity);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
  return(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  return(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) return(0);
  PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
  PetscLogGpuTimeBegin();
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta */
  err = cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows    = A->cmap->n;
    matrixT->num_cols    = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
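    /* a 32-bit device copy of the full row offsets a->i (which are PetscInt on the host) is
       made here, since the csr2csc conversion below works with THRUSTINTARRAY32 indices */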

    /* compute the transpose, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                         A->cmap->n, matrix->num_entries,
                                         matrix->values->data().get(),
                                         cusparsestruct->rowoffsets_gpu->data().get(),
                                         matrix->column_indices->data().get(),
                                         matrixT->values->data().get(),
                                         matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                         CUSPARSE_ACTION_NUMERIC,indexBase,
                                         cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
#endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                            );CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows    = A->rmap->n;
    temp->num_cols    = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows    = A->rmap->n;
    tempT->num_cols    = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#endif
  }
  err = WaitForCUDA();CHKERRCUDA(err);
  PetscLogGpuTimeEnd();
  PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  return(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscLogGpuTimeBegin();
1272: /* First, reorder with the row permutation */
1273: thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1274: thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1275: xGPU);
1277: /* Next, solve U */
1278: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1279: upTriFactorT->csrMat->num_rows,
1280: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1281: upTriFactorT->csrMat->num_entries,
1282: #endif
1283: &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1284: upTriFactorT->csrMat->values->data().get(),
1285: upTriFactorT->csrMat->row_offsets->data().get(),
1286: upTriFactorT->csrMat->column_indices->data().get(),
1287: upTriFactorT->solveInfo,
1288: xarray, tempGPU->data().get()
1289: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1290: ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1291: #endif
1292: );CHKERRCUSPARSE(stat);
1294: /* Then, solve L */
1295: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1296: loTriFactorT->csrMat->num_rows,
1297: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1298: loTriFactorT->csrMat->num_entries,
1299: #endif
1300: &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1301: loTriFactorT->csrMat->values->data().get(),
1302: loTriFactorT->csrMat->row_offsets->data().get(),
1303: loTriFactorT->csrMat->column_indices->data().get(),
1304: loTriFactorT->solveInfo,
1305: tempGPU->data().get(), xarray
1306: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1307: ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1308: #endif
1309: );CHKERRCUSPARSE(stat);
1311: /* Last, copy the solution xGPU into a temporary using the column permutation; this cannot be done in place */
1312: thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1313: thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1314: tempGPU->begin());
1316: /* Copy the temporary to the full solution. */
1317: thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1319: /* restore */
1320: VecCUDARestoreArrayRead(bb,&barray);
1321: VecCUDARestoreArrayWrite(xx,&xarray);
1322: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1323: PetscLogGpuTimeEnd();
1324: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1325: return(0);
1326: }
1328: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1329: {
1330: const PetscScalar *barray;
1331: PetscScalar *xarray;
1332: cusparseStatus_t stat;
1333: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1334: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1335: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1336: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1337: PetscErrorCode ierr;
1338: cudaError_t cerr;
1341: /* Analyze the matrix and create the transpose ... on the fly */
1342: if (!loTriFactorT && !upTriFactorT) {
1343: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1344: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1345: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1346: }
1348: /* Get the GPU pointers */
1349: VecCUDAGetArrayWrite(xx,&xarray);
1350: VecCUDAGetArrayRead(bb,&barray);
1352: PetscLogGpuTimeBegin();
1353: /* First, solve U */
1354: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1355: upTriFactorT->csrMat->num_rows,
1356: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1357: upTriFactorT->csrMat->num_entries,
1358: #endif
1359: &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1360: upTriFactorT->csrMat->values->data().get(),
1361: upTriFactorT->csrMat->row_offsets->data().get(),
1362: upTriFactorT->csrMat->column_indices->data().get(),
1363: upTriFactorT->solveInfo,
1364: barray, tempGPU->data().get()
1365: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1366: ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1367: #endif
1368: );CHKERRCUSPARSE(stat);
1370: /* Then, solve L */
1371: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1372: loTriFactorT->csrMat->num_rows,
1373: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1374: loTriFactorT->csrMat->num_entries,
1375: #endif
1376: &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1377: loTriFactorT->csrMat->values->data().get(),
1378: loTriFactorT->csrMat->row_offsets->data().get(),
1379: loTriFactorT->csrMat->column_indices->data().get(),
1380: loTriFactorT->solveInfo,
1381: tempGPU->data().get(), xarray
1382: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1383: ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1384: #endif
1385: );CHKERRCUSPARSE(stat);
1387: /* restore */
1388: VecCUDARestoreArrayRead(bb,&barray);
1389: VecCUDARestoreArrayWrite(xx,&xarray);
1390: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1391: PetscLogGpuTimeEnd();
1392: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1393: return(0);
1394: }
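/* Editorial sketch (derived from the code below) of the permuted solve performed by
   MatSolve_SeqAIJCUSPARSE, writing r = rpermIndices and c = cpermIndices:

     t[i] = b[r[i]]     gather b with the row permutation
     y    = L^{-1} t    lower triangular solve
     w    = U^{-1} y    upper triangular solve
     x[i] = w[c[i]]     gather w with the column permutation
*/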
1396: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1397: {
1398: const PetscScalar *barray;
1399: PetscScalar *xarray;
1400: thrust::device_ptr<const PetscScalar> bGPU;
1401: thrust::device_ptr<PetscScalar> xGPU;
1402: cusparseStatus_t stat;
1403: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1404: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1405: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1406: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1407: PetscErrorCode ierr;
1408: cudaError_t cerr;
1412: /* Get the GPU pointers */
1413: VecCUDAGetArrayWrite(xx,&xarray);
1414: VecCUDAGetArrayRead(bb,&barray);
1415: xGPU = thrust::device_pointer_cast(xarray);
1416: bGPU = thrust::device_pointer_cast(barray);
1418: PetscLogGpuTimeBegin();
1419: /* First, reorder with the row permutation */
1420: thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1421: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1422: tempGPU->begin());
1424: /* Next, solve L */
1425: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1426: loTriFactor->csrMat->num_rows,
1427: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1428: loTriFactor->csrMat->num_entries,
1429: #endif
1430: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1431: loTriFactor->csrMat->values->data().get(),
1432: loTriFactor->csrMat->row_offsets->data().get(),
1433: loTriFactor->csrMat->column_indices->data().get(),
1434: loTriFactor->solveInfo,
1435: tempGPU->data().get(), xarray
1436: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1437: ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1438: #endif
1439: );CHKERRCUSPARSE(stat);
1441: /* Then, solve U */
1442: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1443: upTriFactor->csrMat->num_rows,
1444: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1445: upTriFactor->csrMat->num_entries,
1446: #endif
1447: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1448: upTriFactor->csrMat->values->data().get(),
1449: upTriFactor->csrMat->row_offsets->data().get(),
1450: upTriFactor->csrMat->column_indices->data().get(),
1451: upTriFactor->solveInfo,
1452: xarray, tempGPU->data().get()
1453: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1454: ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1455: #endif
1456: );CHKERRCUSPARSE(stat);
1458: /* Last, reorder with the column permutation */
1459: thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1460: thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1461: xGPU);
1463: VecCUDARestoreArrayRead(bb,&barray);
1464: VecCUDARestoreArrayWrite(xx,&xarray);
1465: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1466: PetscLogGpuTimeEnd();
1467: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1468: return(0);
1469: }
1471: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1472: {
1473: const PetscScalar *barray;
1474: PetscScalar *xarray;
1475: cusparseStatus_t stat;
1476: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1477: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1478: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1479: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1480: PetscErrorCode ierr;
1481: cudaError_t cerr;
1484: /* Get the GPU pointers */
1485: VecCUDAGetArrayWrite(xx,&xarray);
1486: VecCUDAGetArrayRead(bb,&barray);
1488: PetscLogGpuTimeBegin();
1489: /* First, solve L */
1490: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1491: loTriFactor->csrMat->num_rows,
1492: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1493: loTriFactor->csrMat->num_entries,
1494: #endif
1495: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1496: loTriFactor->csrMat->values->data().get(),
1497: loTriFactor->csrMat->row_offsets->data().get(),
1498: loTriFactor->csrMat->column_indices->data().get(),
1499: loTriFactor->solveInfo,
1500: barray, tempGPU->data().get()
1501: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1502: ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1503: #endif
1504: );CHKERRCUSPARSE(stat);
1506: /* Next, solve U */
1507: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1508: upTriFactor->csrMat->num_rows,
1509: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1510: upTriFactor->csrMat->num_entries,
1511: #endif
1512: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1513: upTriFactor->csrMat->values->data().get(),
1514: upTriFactor->csrMat->row_offsets->data().get(),
1515: upTriFactor->csrMat->column_indices->data().get(),
1516: upTriFactor->solveInfo,
1517: tempGPU->data().get(), xarray
1518: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1519: ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1520: #endif
1521: );CHKERRCUSPARSE(stat);
1523: VecCUDARestoreArrayRead(bb,&barray);
1524: VecCUDARestoreArrayWrite(xx,&xarray);
1525: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1526: PetscLogGpuTimeEnd();
1527: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1528: return(0);
1529: }
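/* Editorial note: MatSeqAIJCUSPARSECopyToGPU() below implements a lazy host-to-device mirror.
   If the matrix was assembled before with an unchanged nonzero pattern (and CSR storage), only
   the values array is re-uploaded and the cached transpose, if present, is regenerated
   numerically; otherwise the whole GPU representation is destroyed and rebuilt from a->i, a->j
   and a->a. On success the offload mask becomes PETSC_OFFLOAD_BOTH, i.e. host and device agree. */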
1531: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1532: {
1533: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1534: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1535: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1536: PetscInt m = A->rmap->n,*ii,*ridx,tmp;
1537: PetscErrorCode ierr;
1538: cusparseStatus_t stat;
1539: cudaError_t err;
1542: if (A->boundtocpu) return(0);
1543: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1544: if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1545: /* Copy values only */
1546: CsrMatrix *matrix,*matrixT;
1547: matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1549: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1550: matrix->values->assign(a->a, a->a+a->nz);
1551: err = WaitForCUDA();CHKERRCUDA(err);
1552: PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1553: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1555: /* Update matT if it has already been built */
1556: if (cusparsestruct->matTranspose) {
1557: cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1558: matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1559: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1560: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1561: A->cmap->n, matrix->num_entries,
1562: matrix->values->data().get(),
1563: cusparsestruct->rowoffsets_gpu->data().get(),
1564: matrix->column_indices->data().get(),
1565: matrixT->values->data().get(),
1566: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1567: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1568: CUSPARSE_ACTION_NUMERIC,indexBase,
1569: cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1570: #else
1571: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1572: CUSPARSE_ACTION_NUMERIC, indexBase
1573: #endif
1574: );CHKERRCUSPARSE(stat);
1575: err = WaitForCUDA();CHKERRCUDA(err);
1576: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1577: }
1578: } else {
1579: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1580: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1581: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);
1582: delete cusparsestruct->workVector;
1583: delete cusparsestruct->rowoffsets_gpu;
1584: try {
1585: if (a->compressedrow.use) {
1586: m = a->compressedrow.nrows;
1587: ii = a->compressedrow.i;
1588: ridx = a->compressedrow.rindex;
1589: } else {
1590: m = A->rmap->n;
1591: ii = a->i;
1592: ridx = NULL;
1593: }
1594: cusparsestruct->nrows = m;
1596: /* create cusparse matrix */
1597: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1598: stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1599: stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1600: stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1602: err = cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err);
1603: err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1604: err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1605: err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1606: err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1607: err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1608: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1610: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1611: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1612: /* set the matrix */
1613: CsrMatrix *mat= new CsrMatrix;
1614: mat->num_rows = m;
1615: mat->num_cols = A->cmap->n;
1616: mat->num_entries = a->nz;
1617: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1618: mat->row_offsets->assign(ii, ii + m+1);
1620: mat->column_indices = new THRUSTINTARRAY32(a->nz);
1621: mat->column_indices->assign(a->j, a->j+a->nz);
1623: mat->values = new THRUSTARRAY(a->nz);
1624: mat->values->assign(a->a, a->a+a->nz);
1626: /* assign the pointer */
1627: matstruct->mat = mat;
1628: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1629: if (mat->num_rows) { /* cusparse errors on empty matrices! */
1630: stat = cusparseCreateCsr(&matstruct->matDescr,
1631: mat->num_rows, mat->num_cols, mat->num_entries,
1632: mat->row_offsets->data().get(), mat->column_indices->data().get(),
1633: mat->values->data().get(),
1634: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1635: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1636: }
1637: #endif
1638: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1639: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1640: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1641: #else
1642: CsrMatrix *mat= new CsrMatrix;
1643: mat->num_rows = m;
1644: mat->num_cols = A->cmap->n;
1645: mat->num_entries = a->nz;
1646: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1647: mat->row_offsets->assign(ii, ii + m+1);
1649: mat->column_indices = new THRUSTINTARRAY32(a->nz);
1650: mat->column_indices->assign(a->j, a->j+a->nz);
1652: mat->values = new THRUSTARRAY(a->nz);
1653: mat->values->assign(a->a, a->a+a->nz);
1655: cusparseHybMat_t hybMat;
1656: stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1657: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1658: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1659: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1660: matstruct->descr, mat->values->data().get(),
1661: mat->row_offsets->data().get(),
1662: mat->column_indices->data().get(),
1663: hybMat, 0, partition);CHKERRCUSPARSE(stat);
1664: /* assign the pointer */
1665: matstruct->mat = hybMat;
1667: if (mat) {
1668: if (mat->values) delete (THRUSTARRAY*)mat->values;
1669: if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1670: if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1671: delete (CsrMatrix*)mat;
1672: }
1673: #endif
1674: }
1676: /* assign the compressed row indices */
1677: if (a->compressedrow.use) {
1678: cusparsestruct->workVector = new THRUSTARRAY(m);
1679: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1680: matstruct->cprowIndices->assign(ridx,ridx+m);
1681: tmp = m;
1682: } else {
1683: cusparsestruct->workVector = NULL;
1684: matstruct->cprowIndices = NULL;
1685: tmp = 0;
1686: }
1687: PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));
1689: /* assign the pointer */
1690: cusparsestruct->mat = matstruct;
1691: } catch(char *ex) {
1692: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1693: }
1694: err = WaitForCUDA();CHKERRCUDA(err);
1695: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1696: cusparsestruct->nonzerostate = A->nonzerostate;
1697: }
1698: A->offloadmask = PETSC_OFFLOAD_BOTH;
1699: }
1700: return(0);
1701: }
1703: struct VecCUDAPlusEquals
1704: {
1705: template <typename Tuple>
1706: __host__ __device__
1707: void operator()(Tuple t)
1708: {
1709: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1710: }
1711: };
1713: struct VecCUDAEqualsReverse
1714: {
1715: template <typename Tuple>
1716: __host__ __device__
1717: void operator()(Tuple t)
1718: {
1719: thrust::get<0>(t) = thrust::get<1>(t);
1720: }
1721: };
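/* Editorial note: these functors are driven by thrust::for_each over a zip of
   (packed work vector, permuted view of a full vector). VecCUDAPlusEquals scatter-adds the
   packed values into the full vector; VecCUDAEqualsReverse gathers the other way. A minimal
   sketch with hypothetical names work/rows/zarray, mirroring the calls in
   MatMultAddKernel_SeqAIJCUSPARSE below:

     thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
     thrust::for_each(
       thrust::make_zip_iterator(thrust::make_tuple(work->begin(),
         thrust::make_permutation_iterator(zptr, rows->begin()))),
       thrust::make_zip_iterator(thrust::make_tuple(work->begin(),
         thrust::make_permutation_iterator(zptr, rows->begin()))) + rows->size(),
       VecCUDAPlusEquals());   // zarray[rows[i]] += work[i]
*/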
1723: struct MatMatCusparse {
1724: PetscBool cisdense;
1725: PetscScalar *Bt;
1726: Mat X;
1727: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1728: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
1729: cusparseDnMatDescr_t matBDescr;
1730: cusparseDnMatDescr_t matCDescr;
1731: size_t spmmBufferSize;
1732: void *spmmBuffer;
1733: PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1734: #endif
1735: };
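/* Editorial note: MatMatCusparse caches what is expensive to recreate between repeated numeric
   phases of the same symbolic product: the dense descriptors for B and C, the SpMM work buffer
   (CUDA >= 11), an explicit B^T copy (pre-CUDA 11), and the intermediate dense matrix X used by
   PtAP/RARt. Cached descriptors are invalidated whenever the leading dimensions Blda/Clda change. */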
1737: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1738: {
1740: MatMatCusparse *mmdata = (MatMatCusparse *)data;
1741: cudaError_t cerr;
1744: cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1745: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1746: cusparseStatus_t stat;
1747: if (mmdata->matBDescr) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
1748: if (mmdata->matCDescr) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
1749: if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1750: #endif
1751: MatDestroy(&mmdata->X);
1752: PetscFree(data);
1753: return(0);
1754: }
1756: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1758: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1759: {
1760: Mat_Product *product = C->product;
1761: Mat A,B;
1762: PetscInt m,n,blda,clda;
1763: PetscBool flg,biscuda;
1764: Mat_SeqAIJCUSPARSE *cusp;
1765: cusparseStatus_t stat;
1766: cusparseOperation_t opA;
1767: const PetscScalar *barray;
1768: PetscScalar *carray;
1769: PetscErrorCode ierr;
1770: MatMatCusparse *mmdata;
1771: Mat_SeqAIJCUSPARSEMultStruct *mat;
1772: CsrMatrix *csrmat;
1773: cudaError_t cerr;
1776: MatCheckProduct(C,1);
1777: if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1778: mmdata = (MatMatCusparse*)product->data;
1779: A = product->A;
1780: B = product->B;
1781: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1782: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1783: /* currently CopyToGpu does not copy if the matrix is bound to the CPU.
1784: Instead of silently accepting the wrong answer, we prefer to raise an error */
1785: if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1786: MatSeqAIJCUSPARSECopyToGPU(A);
1787: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1788: switch (product->type) {
1789: case MATPRODUCT_AB:
1790: case MATPRODUCT_PtAP:
1791: mat = cusp->mat;
1792: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1793: m = A->rmap->n;
1794: n = B->cmap->n;
1795: break;
1796: case MATPRODUCT_AtB:
1797: if (!cusp->transgen) {
1798: mat = cusp->mat;
1799: opA = CUSPARSE_OPERATION_TRANSPOSE;
1800: } else {
1801: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1802: mat = cusp->matTranspose;
1803: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1804: }
1805: m = A->cmap->n;
1806: n = B->cmap->n;
1807: break;
1808: case MATPRODUCT_ABt:
1809: case MATPRODUCT_RARt:
1810: mat = cusp->mat;
1811: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1812: m = A->rmap->n;
1813: n = B->rmap->n;
1814: break;
1815: default:
1816: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1817: }
1818: if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1819: csrmat = (CsrMatrix*)mat->mat;
1820: /* if the user passed a CPU matrix, copy the data to the GPU */
1821: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
1822: if (!biscuda) {MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);}
1823: MatDenseCUDAGetArrayRead(B,&barray);
1825: MatDenseGetLDA(B,&blda);
1826: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1827: MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
1828: MatDenseGetLDA(mmdata->X,&clda);
1829: } else {
1830: MatDenseCUDAGetArrayWrite(C,&carray);
1831: MatDenseGetLDA(C,&clda);
1832: }
1834: PetscLogGpuTimeBegin();
1835: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1836: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
1837: /* (re)allocate spmmBuffer if not initialized or the LDAs have changed */
1838: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1839: if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
1840: if (!mmdata->matBDescr) {
1841: stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1842: mmdata->Blda = blda;
1843: }
1845: if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
1846: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1847: stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1848: mmdata->Clda = clda;
1849: }
1851: if (!mat->matDescr) {
1852: stat = cusparseCreateCsr(&mat->matDescr,
1853: csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
1854: csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
1855: csrmat->values->data().get(),
1856: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1857: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1858: }
1859: stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
1860: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1861: mmdata->matCDescr,cusparse_scalartype,
1862: cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
1863: if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1864: cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
1865: mmdata->initialized = PETSC_TRUE;
1866: } else {
1867: /* to be safe, always update pointers of the mats */
1868: stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
1869: stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
1870: stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
1871: }
1873: /* do cusparseSpMM, which supports transpose on B */
1874: stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
1875: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1876: mmdata->matCDescr,cusparse_scalartype,
1877: cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
1878: #else
1879: PetscInt k;
1880: /* cusparseXcsrmm does not support transpose on B */
1881: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1882: cublasHandle_t cublasv2handle;
1883: cublasStatus_t cerr;
1885: PetscCUBLASGetHandle(&cublasv2handle);
1886: cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1887: B->cmap->n,B->rmap->n,
1888: &PETSC_CUSPARSE_ONE ,barray,blda,
1889: &PETSC_CUSPARSE_ZERO,barray,blda,
1890: mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1891: blda = B->cmap->n;
1892: k = B->cmap->n;
1893: } else {
1894: k = B->rmap->n;
1895: }
1897: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
1898: stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1899: csrmat->num_entries,mat->alpha_one,mat->descr,
1900: csrmat->values->data().get(),
1901: csrmat->row_offsets->data().get(),
1902: csrmat->column_indices->data().get(),
1903: mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1904: carray,clda);CHKERRCUSPARSE(stat);
1905: #endif
1906: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1907: PetscLogGpuTimeEnd();
1908: PetscLogGpuFlops(n*2.0*csrmat->num_entries);
1909: MatDenseCUDARestoreArrayRead(B,&barray);
1910: if (product->type == MATPRODUCT_RARt) {
1911: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
1912: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
1913: } else if (product->type == MATPRODUCT_PtAP) {
1914: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
1915: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
1916: } else {
1917: MatDenseCUDARestoreArrayWrite(C,&carray);
1918: }
1919: if (mmdata->cisdense) {
1920: MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
1921: }
1922: if (!biscuda) {
1923: MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
1924: }
1925: return(0);
1926: }
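/* Editorial summary of the result sizes used by the numeric routine above and chosen in the
   symbolic routine below, where B plays P in PtAP and R in RARt:

     MATPRODUCT_AB  : C is A->rmap->n x B->cmap->n   (C = A B)
     MATPRODUCT_AtB : C is A->cmap->n x B->cmap->n   (C = A^T B)
     MATPRODUCT_ABt : C is A->rmap->n x B->rmap->n   (C = A B^T)
     MATPRODUCT_PtAP: C is B->cmap->n x B->cmap->n   (C = P^T A P)
     MATPRODUCT_RARt: C is B->rmap->n x B->rmap->n   (C = R A R^T)
*/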
1928: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1929: {
1930: Mat_Product *product = C->product;
1931: Mat A,B;
1932: PetscInt m,n;
1933: PetscBool cisdense,flg;
1934: PetscErrorCode ierr;
1935: MatMatCusparse *mmdata;
1936: Mat_SeqAIJCUSPARSE *cusp;
1939: MatCheckProduct(C,1);
1940: if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1941: A = product->A;
1942: B = product->B;
1943: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1944: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1945: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1946: if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
1947: switch (product->type) {
1948: case MATPRODUCT_AB:
1949: m = A->rmap->n;
1950: n = B->cmap->n;
1951: break;
1952: case MATPRODUCT_AtB:
1953: m = A->cmap->n;
1954: n = B->cmap->n;
1955: break;
1956: case MATPRODUCT_ABt:
1957: m = A->rmap->n;
1958: n = B->rmap->n;
1959: break;
1960: case MATPRODUCT_PtAP:
1961: m = B->cmap->n;
1962: n = B->cmap->n;
1963: break;
1964: case MATPRODUCT_RARt:
1965: m = B->rmap->n;
1966: n = B->rmap->n;
1967: break;
1968: default:
1969: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1970: }
1971: MatSetSizes(C,m,n,m,n);
1972: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
1973: PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
1974: MatSetType(C,MATSEQDENSECUDA);
1976: /* product data */
1977: PetscNew(&mmdata);
1978: mmdata->cisdense = cisdense;
1979: #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
1980: /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
1981: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1982: cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
1983: }
1984: #endif
1985: /* for these products we need intermediate storage */
1986: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1987: MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
1988: MatSetType(mmdata->X,MATSEQDENSECUDA);
1989: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1990: MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
1991: } else {
1992: MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
1993: }
1994: }
1995: C->product->data = mmdata;
1996: C->product->destroy = MatDestroy_MatMatCusparse;
1998: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1999: return(0);
2000: }
2002: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2004: /* handles dense B */
2005: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
2006: {
2007: Mat_Product *product = C->product;
2011: MatCheckProduct(C,1);
2012: if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
2013: if (product->A->boundtocpu) {
2014: MatProductSetFromOptions_SeqAIJ_SeqDense(C);
2015: return(0);
2016: }
2017: switch (product->type) {
2018: case MATPRODUCT_AB:
2019: case MATPRODUCT_AtB:
2020: case MATPRODUCT_ABt:
2021: case MATPRODUCT_PtAP:
2022: case MATPRODUCT_RARt:
2023: C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2024: default:
2025: break;
2026: }
2027: return(0);
2028: }
2030: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2031: {
2035: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2036: return(0);
2037: }
2039: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2040: {
2044: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2045: return(0);
2046: }
2048: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2049: {
2053: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2054: return(0);
2055: }
2057: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2058: {
2062: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2063: return(0);
2064: }
2066: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2067: {
2071: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2072: return(0);
2073: }
2075: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2076: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2077: {
2078: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2079: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2080: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2081: PetscScalar *xarray,*zarray,*dptr,*beta,*xptr;
2082: PetscErrorCode ierr;
2083: cudaError_t cerr;
2084: cusparseStatus_t stat;
2085: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2086: PetscBool compressed;
2087: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2088: PetscInt nx,ny;
2089: #endif
2092: if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2093: if (!a->nonzerorowcnt) {
2094: if (!yy) {VecSet_SeqCUDA(zz,0);}
2095: return(0);
2096: }
2097: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2098: MatSeqAIJCUSPARSECopyToGPU(A);
2099: if (!trans) {
2100: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2101: } else {
2102: if (herm || !cusparsestruct->transgen) {
2103: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2104: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2105: } else {
2106: if (!cusparsestruct->matTranspose) {MatSeqAIJCUSPARSEGenerateTransposeForMult(A);}
2107: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2108: }
2109: }
2110: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2111: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2113: try {
2114: VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
2115: if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so need to get uptodate zarray on GPU */
2116: else {VecCUDAGetArrayWrite(zz,&zarray);} /* write zz, so no need to init zarray on GPU */
2118: PetscLogGpuTimeBegin();
2119: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2120: /* z = A x + beta y.
2121: If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2122: When A is non-compressed and z = y, we can set beta=1 to compute y = Ax + y in one call.
2123: */
2124: xptr = xarray;
2125: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2126: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2127: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2128: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2129: allocated to accommodate different uses. So we get the length info directly from mat.
2130: */
2131: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2132: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2133: nx = mat->num_cols;
2134: ny = mat->num_rows;
2135: }
2136: #endif
2137: } else {
2138: /* z = A^T x + beta y
2139: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2140: Note A^T x is of full length, so we set beta to 1.0 if y exists.
2141: */
2142: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2143: dptr = zarray;
2144: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2145: if (compressed) { /* Scatter x to work vector */
2146: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2147: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2148: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2149: VecCUDAEqualsReverse());
2150: }
2151: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2152: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2153: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2154: nx = mat->num_rows;
2155: ny = mat->num_cols;
2156: }
2157: #endif
2158: }
2160: /* csr_spmv does y = alpha op(A) x + beta y */
2161: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2162: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2163: if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2164: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2165: stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2166: stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2167: stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2168: matstruct->matDescr,
2169: matstruct->cuSpMV[opA].vecXDescr, beta,
2170: matstruct->cuSpMV[opA].vecYDescr,
2171: cusparse_scalartype,
2172: cusparsestruct->spmvAlg,
2173: &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2174: cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2176: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2177: } else {
2178: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2179: stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2180: stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2181: }
2183: stat = cusparseSpMV(cusparsestruct->handle, opA,
2184: matstruct->alpha_one,
2185: matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2186: matstruct->cuSpMV[opA].vecXDescr,
2187: beta,
2188: matstruct->cuSpMV[opA].vecYDescr,
2189: cusparse_scalartype,
2190: cusparsestruct->spmvAlg,
2191: matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2192: #else
2193: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2194: stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2195: mat->num_rows, mat->num_cols,
2196: mat->num_entries, matstruct->alpha_one, matstruct->descr,
2197: mat->values->data().get(), mat->row_offsets->data().get(),
2198: mat->column_indices->data().get(), xptr, beta,
2199: dptr);CHKERRCUSPARSE(stat);
2200: #endif
2201: } else {
2202: if (cusparsestruct->nrows) {
2203: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2204: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2205: #else
2206: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2207: stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2208: matstruct->alpha_one, matstruct->descr, hybMat,
2209: xptr, beta,
2210: dptr);CHKERRCUSPARSE(stat);
2211: #endif
2212: }
2213: }
2214: cerr = WaitForCUDA();CHKERRCUDA(cerr);
2215: PetscLogGpuTimeEnd();
2217: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2218: if (yy) { /* MatMultAdd: zz = A*xx + yy */
2219: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2220: VecCopy_SeqCUDA(yy,zz); /* zz = yy */
2221: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2222: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2223: }
2224: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2225: VecSet_SeqCUDA(zz,0);
2226: }
2228: /* ScatterAdd the result from work vector into the full vector when A is compressed */
2229: if (compressed) {
2230: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2232: PetscLogGpuTimeBegin();
2233: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2234: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2235: VecCUDAPlusEquals());
2236: cerr = WaitForCUDA();CHKERRCUDA(cerr);
2237: PetscLogGpuTimeEnd();
2238: }
2239: } else {
2240: if (yy && yy != zz) {
2241: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2242: }
2243: }
2244: VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
2245: if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
2246: else {VecCUDARestoreArrayWrite(zz,&zarray);}
2247: } catch(char *ex) {
2248: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2249: }
2250: if (yy) {
2251: PetscLogGpuFlops(2.0*a->nz);
2252: } else {
2253: PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
2254: }
2255: return(0);
2256: }
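/* Editorial note on the flop counts logged above: a CSR mat-vec costs one multiply and one add
   per stored nonzero, i.e. 2*nz flops; when there is no vector to accumulate into, the first
   product of each nonzero row needs no add, hence 2*nz - nonzerorowcnt in the plain MatMult case. */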
2258: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2259: {
2263: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
2264: return(0);
2265: }
2267: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
2268: {
2269: PetscErrorCode ierr;
2270: PetscSplitCSRDataStructure *d_mat = NULL, h_mat;
2271: PetscBool is_seq = PETSC_TRUE;
2272: PetscInt nnz_state = A->nonzerostate;
2274: if (A->factortype == MAT_FACTOR_NONE) {
2275: d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2276: }
2277: if (d_mat) {
2278: cudaError_t err;
2279: PetscInfo(A,"Assemble device matrix\n");
2280: err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2281: nnz_state = h_mat.nonzerostate;
2282: is_seq = h_mat.seq;
2283: }
2284: MatAssemblyEnd_SeqAIJ(A,mode); // this does very little if assembled on GPU - call it?
2285: if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) return(0);
2286: if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU even though equipped for GPU
2287: MatSeqAIJCUSPARSECopyToGPU(A);
2288: } else if (nnz_state > A->nonzerostate) {
2289: A->offloadmask = PETSC_OFFLOAD_GPU;
2290: }
2292: return(0);
2293: }
2295: /* --------------------------------------------------------------------------------*/
2296: /*@
2297: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2298: (the default parallel PETSc format). This matrix will ultimately be pushed down
2299: to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2300: assembly performance the user should preallocate the matrix storage by setting
2301: the parameter nz (or the array nnz). By setting these parameters accurately,
2302: performance during matrix assembly can be increased by more than a factor of 50.
2304: Collective
2306: Input Parameters:
2307: + comm - MPI communicator, set to PETSC_COMM_SELF
2308: . m - number of rows
2309: . n - number of columns
2310: . nz - number of nonzeros per row (same for all rows)
2311: - nnz - array containing the number of nonzeros in the various rows
2312: (possibly different for each row) or NULL
2314: Output Parameter:
2315: . A - the matrix
2317: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2318: MatXXXXSetPreallocation() paradigm instead of this routine directly.
2319: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2321: Notes:
2322: If nnz is given then nz is ignored
2324: The AIJ format (also called the Yale sparse matrix format or
2325: compressed row storage) is fully compatible with standard Fortran 77
2326: storage. That is, the stored row and column indices can begin at
2327: either one (as in Fortran) or zero. See the users' manual for details.
2329: Specify the preallocated storage with either nz or nnz (not both).
2330: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2331: allocation. For large problems you MUST preallocate memory or you
2332: will get TERRIBLE performance, see the users' manual chapter on matrices.
2334: By default, this format uses inodes (identical nodes) when possible, to
2335: improve numerical efficiency of matrix-vector products and solves. We
2336: search for consecutive rows with the same nonzero structure, thereby
2337: reusing matrix information to achieve increased efficiency.
2339: Level: intermediate
2341: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2342: @*/
2343: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
2344: {
2348: MatCreate(comm,A);
2349: MatSetSizes(*A,m,n,m,n);
2350: MatSetType(*A,MATSEQAIJCUSPARSE);
2351: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
2352: return(0);
2353: }
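/* Editorial usage sketch (not part of the original source): create a preallocated tridiagonal
   AIJCUSPARSE matrix and assemble it.

     Mat            A;
     PetscInt       n = 5;     example size
     PetscErrorCode ierr;
     ierr = MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,n,n,3,NULL,&A);CHKERRQ(ierr);
     ... fill rows with MatSetValues(A,...) ...
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/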
2355: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
2356: {
2357: PetscErrorCode ierr;
2358: PetscSplitCSRDataStructure *d_mat = NULL;
2361: if (A->factortype == MAT_FACTOR_NONE) {
2362: d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2363: ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
2364: MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
2365: } else {
2366: MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
2367: }
2368: if (d_mat) {
2369: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2370: cudaError_t err;
2371: PetscSplitCSRDataStructure h_mat;
2372: PetscInfo(A,"Have device matrix\n");
2373: err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2374: if (h_mat.seq) {
2375: if (a->compressedrow.use) {
2376: err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
2377: }
2378: err = cudaFree(d_mat);CHKERRCUDA(err);
2379: }
2380: }
2381: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
2382: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
2383: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
2384: PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
2385: MatDestroy_SeqAIJ(A);
2386: return(0);
2387: }
2389: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
2390: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
2391: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
2392: {
2396: MatDuplicate_SeqAIJ(A,cpvalues,B);
2397: MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
2398: return(0);
2399: }
2401: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
2402: {
2403: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2407: if (A->factortype != MAT_FACTOR_NONE) return(0);
2408: /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
2409: If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
2410: Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
2411: TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
2412: can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
2413: if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
2414: /* TODO: add support for this? */
2415: if (flg) {
2416: A->ops->mult = MatMult_SeqAIJ;
2417: A->ops->multadd = MatMultAdd_SeqAIJ;
2418: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
2419: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
2420: A->ops->multhermitiantranspose = NULL;
2421: A->ops->multhermitiantransposeadd = NULL;
2422: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
2423: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
2424: } else {
2425: A->ops->mult = MatMult_SeqAIJCUSPARSE;
2426: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
2427: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
2428: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
2429: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
2430: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2431: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
2432: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
2433: }
2434: A->boundtocpu = flg;
2435: a->inode.use = flg;
2436: return(0);
2437: }
2439: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
2440: {
2441: PetscSplitCSRDataStructure *d_mat = NULL;
2442: PetscErrorCode ierr;
2444: if (A->factortype == MAT_FACTOR_NONE) {
2445: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
2446: d_mat = spptr->deviceMat;
2447: }
2448: if (d_mat) {
2449: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2450: PetscInt n = A->rmap->n, nnz = a->i[n];
2451: cudaError_t err;
2452: PetscScalar *vals;
2453: PetscInfo(A,"Zero device matrix\n");
2454: err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2455: err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
2456: }
2457: MatZeroEntries_SeqAIJ(A);
2459: return(0);
2460: }
2462: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
2463: {
2464: PetscErrorCode ierr;
2465: cusparseStatus_t stat;
2466: Mat B;
2469: PetscCUDAInitializeCheck();
2470: if (reuse == MAT_INITIAL_MATRIX) {
2471: MatDuplicate(A,MAT_COPY_VALUES,newmat);
2472: } else if (reuse == MAT_REUSE_MATRIX) {
2473: MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
2474: }
2475: B = *newmat;
2477: PetscFree(B->defaultvectype);
2478: PetscStrallocpy(VECCUDA,&B->defaultvectype);
2480: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
2481: if (B->factortype == MAT_FACTOR_NONE) {
2482: Mat_SeqAIJCUSPARSE *spptr;
2484: PetscNew(&spptr);
2485: spptr->format = MAT_CUSPARSE_CSR;
2486: stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2487: B->spptr = spptr;
2488: spptr->deviceMat = NULL;
2489: } else {
2490: Mat_SeqAIJCUSPARSETriFactors *spptr;
2492: PetscNew(&spptr);
2493: stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2494: B->spptr = spptr;
2495: }
2496: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2497: }
2498: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
2499: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
2500: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
2501: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
2502: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
2503: B->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
2505: MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
2506: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
2507: PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
2508: return(0);
2509: }
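/* Editorial usage sketch: promote an existing host MATSEQAIJ matrix to CUSPARSE in place,

     ierr = MatConvert(A,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);

   after which MatMult() and friends dispatch to the GPU implementations installed by the
   MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE) call above. */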
2511: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
2512: {
2516: MatCreate_SeqAIJ(B);
2517: MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
2518: PetscObjectOptionsBegin((PetscObject)B);
2519: MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);
2520: PetscOptionsEnd();
2521: return(0);
2522: }
2524: /*MC
2525: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2527: A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2528: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2529: All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2531: Options Database Keys:
2532: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2533: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2534: - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2536: Level: beginner
2538: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2539: M*/
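/* Editorial example (assumed typical usage, not from this file): select the type and storage
   format from the options database,

     ./app -mat_type aijcusparse -mat_cusparse_storage_format csr
*/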
2541: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2544: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
2545: {
2549: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
2550: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
2551: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
2552: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
2553: return(0);
2554: }
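/* Editorial note: with the registrations above, the cusparse factorizations can be selected
   from the options database, e.g. (a sketch, assuming the usual PETSc option names):

     ./app -pc_type ilu -pc_factor_mat_solver_type cusparse
*/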
2556: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
2557: {
2558: PetscErrorCode ierr;
2559: cusparseStatus_t stat;
2560: cusparseHandle_t handle;
2563: if (*cusparsestruct) {
2564: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
2565: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
2566: delete (*cusparsestruct)->workVector;
2567: delete (*cusparsestruct)->rowoffsets_gpu;
2568: if ((handle = (*cusparsestruct)->handle)) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);}
2569: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2570: cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
2571: #endif
2572: PetscFree(*cusparsestruct);
2573: }
2574: return(0);
2575: }
2577: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
2578: {
2580: if (*mat) {
2581: delete (*mat)->values;
2582: delete (*mat)->column_indices;
2583: delete (*mat)->row_offsets;
2584: delete *mat;
2585: *mat = 0;
2586: }
2587: return(0);
2588: }
2590: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
2591: {
2592: cusparseStatus_t stat;
2593: PetscErrorCode ierr;
2596: if (*trifactor) {
2597: if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
2598: if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
2599: CsrMatrix_Destroy(&(*trifactor)->csrMat);
2600: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2601: cudaError_t cerr;
2602: if ((*trifactor)->solveBuffer) {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
2603: if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
2604: #endif
2605: delete *trifactor;
2606: *trifactor = 0;
2607: }
2608: return(0);
2609: }
2611: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
2612: {
2613: CsrMatrix *mat;
2614: cusparseStatus_t stat;
2615: cudaError_t err;
2618: if (*matstruct) {
2619: if ((*matstruct)->mat) {
2620: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
2621: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2622: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2623: #else
2624: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
2625: stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
2626: #endif
2627: } else {
2628: mat = (CsrMatrix*)(*matstruct)->mat;
2629: CsrMatrix_Destroy(&mat);
2630: }
2631: }
2632: if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
2633: delete (*matstruct)->cprowIndices;
2634: if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
2635: if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
2636: if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
2638: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2639: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
2640: if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
2641: for (int i=0; i<3; i++) {
2642: if (mdata->cuSpMV[i].initialized) {
2643: err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
2644: stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
2645: stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
2646: }
2647: }
2648: #endif
2649: delete *matstruct;
2650: *matstruct = 0;
2651: }
2652: return(0);
2653: }
2655: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2656: {
2660: if (*trifactors) {
2661: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
2662: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
2663: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
2664: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
2665: delete (*trifactors)->rpermIndices;
2666: delete (*trifactors)->cpermIndices;
2667: delete (*trifactors)->workVector;
2668: (*trifactors)->rpermIndices = 0;
2669: (*trifactors)->cpermIndices = 0;
2670: (*trifactors)->workVector = 0;
2671: }
2672: return(0);
2673: }
2675: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2676: {
2677: PetscErrorCode ierr;
2678: cusparseHandle_t handle;
2679: cusparseStatus_t stat;
2682: if (*trifactors) {
2683: MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
2684: if ((handle = (*trifactors)->handle)) {
2685: stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
2686: }
2687: PetscFree(*trifactors);
2688: }
2689: return(0);
2690: }