Actual source code: aijcusparse.cu
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library.
4: */
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/vecimpl.h>
12: #undef VecType
13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14: #include <thrust/adjacent_difference.h>
15: #if PETSC_CPP_VERSION >= 14
16: #define PETSC_HAVE_THRUST_ASYNC 1
17: // thrust::for_each(thrust::cuda::par.on()) requires C++14
18: #include <thrust/async/for_each.h>
19: #endif
20: #include <thrust/iterator/constant_iterator.h>
21: #include <thrust/remove.h>
22: #include <thrust/sort.h>
23: #include <thrust/unique.h>
25: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
26: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
27: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we copy them in
28: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
30: typedef enum {
31: CUSPARSE_MV_ALG_DEFAULT = 0,
32: CUSPARSE_COOMV_ALG = 1,
33: CUSPARSE_CSRMV_ALG1 = 2,
34: CUSPARSE_CSRMV_ALG2 = 3
35: } cusparseSpMVAlg_t;
37: typedef enum {
38: CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
39: CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
40: CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
41: CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
42: CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
43: CUSPARSE_SPMM_ALG_DEFAULT = 0,
44: CUSPARSE_SPMM_COO_ALG1 = 1,
45: CUSPARSE_SPMM_COO_ALG2 = 2,
46: CUSPARSE_SPMM_COO_ALG3 = 3,
47: CUSPARSE_SPMM_COO_ALG4 = 5,
48: CUSPARSE_SPMM_CSR_ALG1 = 4,
49: CUSPARSE_SPMM_CSR_ALG2 = 6,
50: } cusparseSpMMAlg_t;
52: typedef enum {
53: CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
54: CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic
55: } cusparseCsr2CscAlg_t;
56: */
57: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
58: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
59: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
60: #endif
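/* Illustrative note (not part of the original source): PetscOptionsEnum() returns the 0-based position of
   the string the user picked from the arrays above, which is why the strings are listed in the same 0-based
   integer order as the cuSPARSE enum values copied in the comment. For example, the hypothetical option value

     -mat_cusparse_spmv_alg CSRMV_ALG1

   selects index 2 of MatCUSPARSESpMVAlgorithms[], i.e. the value of CUSPARSE_CSRMV_ALG1, which is exactly what
   the PetscCheck() consistency tests in MatSetFromOptions_SeqAIJCUSPARSE() below verify. */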
62: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
63: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
65: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
66: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
67: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
68: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
69: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
70: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
72: #endif
73: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
74: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
75: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
76: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
77: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
78: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
79: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
80: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
81: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
82: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
84: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
85: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
86: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
87: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
89: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
90: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
92: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
93: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
94: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
96: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
97: {
98: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
100: PetscFunctionBegin;
101: switch (op) {
102: case MAT_CUSPARSE_MULT:
103: cusparsestruct->format = format;
104: break;
105: case MAT_CUSPARSE_ALL:
106: cusparsestruct->format = format;
107: break;
108: default:
109: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
110: }
111: PetscFunctionReturn(PETSC_SUCCESS);
112: }
114: /*@
115: MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
116: operation. Only the `MatMult()` operation can use different GPU storage formats.
118: Not Collective
120: Input Parameters:
121: + A - Matrix of type `MATSEQAIJCUSPARSE`
122: . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
123: `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`, `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
124: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`).
126: Level: intermediate
128: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
129: @*/
130: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
131: {
132: PetscFunctionBegin;
134: PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
135: PetscFunctionReturn(PETSC_SUCCESS);
136: }
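/* A minimal usage sketch (illustrative only, not part of the original source). Assuming an application has
   created a sequential AIJ-CUSPARSE matrix (n is a placeholder size), the SpMV storage format could be
   switched to ELL as follows:

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, n, n, n, n));
     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL));

   Because PetscTryMethod() is used above, the call is a no-op on matrix types that do not compose
   MatCUSPARSESetFormat_C. */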
138: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
139: {
140: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
142: PetscFunctionBegin;
143: cusparsestruct->use_cpu_solve = use_cpu;
144: PetscFunctionReturn(PETSC_SUCCESS);
145: }
147: /*@
148: MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.
150: Input Parameters:
151: + A - Matrix of type `MATSEQAIJCUSPARSE`
152: - use_cpu - set flag for using the built-in CPU `MatSolve()`
154: Level: intermediate
156: Note:
157: The cuSparse LU solver currently computes the factors with the built-in CPU method
158: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
159: This method specifies whether the solve is done on the CPU or on the GPU (GPU is the default).
161: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
162: @*/
163: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
164: {
165: PetscFunctionBegin;
167: PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
168: PetscFunctionReturn(PETSC_SUCCESS);
169: }
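/* A minimal usage sketch (illustrative only, not part of the original source). To request the built-in CPU
   `MatSolve()` for a `MATSEQAIJCUSPARSE` factor:

     PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE));

   or, equivalently, pass the run-time option -mat_cusparse_use_cpu_solve, which is parsed in
   MatSetFromOptions_SeqAIJCUSPARSE() below. */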
171: static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
172: {
173: PetscFunctionBegin;
174: switch (op) {
175: case MAT_FORM_EXPLICIT_TRANSPOSE:
176: /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
177: if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
178: A->form_explicit_transpose = flg;
179: break;
180: default:
181: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
182: break;
183: }
184: PetscFunctionReturn(PETSC_SUCCESS);
185: }
187: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
188: {
189: MatCUSPARSEStorageFormat format;
190: PetscBool flg;
191: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
193: PetscFunctionBegin;
194: PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
195: if (A->factortype == MAT_FACTOR_NONE) {
196: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
197: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
199: PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
200: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
201: PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
202: if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
203: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
204: PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
205: /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
206: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
207: PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
208: #else
209: PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
210: #endif
211: PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
212: PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
214: PetscCall(
215: PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
216: PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
217: #endif
218: }
219: PetscOptionsHeadEnd();
220: PetscFunctionReturn(PETSC_SUCCESS);
221: }
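/* Example run-time options handled above (illustrative, assuming the application calls MatSetFromOptions() on a
   MATSEQAIJCUSPARSE matrix; the enum values come from the string arrays defined near the top of this file):

     -mat_cusparse_mult_storage_format ELL    # storage format used for SpMV only
     -mat_cusparse_storage_format CSR         # storage format used for SpMV and TriSolve
     -mat_cusparse_use_cpu_solve              # perform (I)LU solves on the CPU
     -mat_cusparse_spmv_alg CSRMV_ALG1        # cuSPARSE SpMV algorithm (CUDA >= 11.0 builds only)
*/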
223: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
224: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
225: {
226: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
227: PetscInt m = A->rmap->n;
228: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
229: const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
230: const MatScalar *Aa = a->a;
231: PetscInt *Mi, *Mj, Mnz;
232: PetscScalar *Ma;
234: PetscFunctionBegin;
235: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
236: if (!fs->csrRowPtr) { // Is this the first time doing the setup? Use csrRowPtr as the flag since it is not null even when m=0
237: // Rearrange the (skewed) factored matrix and put the result into M, a regular CSR matrix on the host
238: Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
239: PetscCall(PetscMalloc1(m + 1, &Mi));
240: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
241: PetscCall(PetscMalloc1(Mnz, &Ma));
242: Mi[0] = 0;
243: for (PetscInt i = 0; i < m; i++) {
244: PetscInt llen = Ai[i + 1] - Ai[i];
245: PetscInt ulen = Adiag[i] - Adiag[i + 1];
246: PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L
247: Mj[Mi[i] + llen] = i; // diagonal entry
248: PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
249: Mi[i + 1] = Mi[i] + llen + ulen;
250: }
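// Illustrative layout note (not from the original source): after the loop above, row i of the merged matrix M is
//   Mj[Mi[i] .. Mi[i+1]-1] = [ L columns in row i | i (the diagonal) | U columns right of the diagonal ]
// with Mi[i+1] - Mi[i] = llen + ulen. A single CSR matrix thus holds both factors; the CUSPARSE_FILL_MODE_LOWER/UPPER
// and DIAG_TYPE_UNIT/NON_UNIT attributes set on spMatDescr_L and spMatDescr_U below make cuSPARSE view it as
// L (unit diagonal) and U respectively.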
251: // Copy M (L,U) from host to device
252: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
253: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
254: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
255: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*(fs->csrRowPtr)) * (m + 1), cudaMemcpyHostToDevice));
256: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*(fs->csrColIdx)) * Mnz, cudaMemcpyHostToDevice));
258: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
259: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
260: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
261: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
262: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
263: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER;
264: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT;
265: const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
267: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
268: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
269: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
271: fillMode = CUSPARSE_FILL_MODE_UPPER;
272: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
273: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
274: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
275: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
277: // Allocate work vectors for SpSV
278: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
279: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));
281: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
282: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
284: // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
285: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
286: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
287: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
288: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
289: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
290: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
292: // Record for reuse
293: fs->csrRowPtr_h = Mi;
294: fs->csrVal_h = Ma;
295: PetscCall(PetscFree(Mj));
296: }
297: // Copy the value
298: Mi = fs->csrRowPtr_h;
299: Ma = fs->csrVal_h;
300: Mnz = Mi[m];
301: for (PetscInt i = 0; i < m; i++) {
302: PetscInt llen = Ai[i + 1] - Ai[i];
303: PetscInt ulen = Adiag[i] - Adiag[i + 1];
304: PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L
305: Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]]; // recover the diagonal entry
306: PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
307: }
308: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
310: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
311: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
313: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
315: // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
316: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
317: }
318: PetscFunctionReturn(PETSC_SUCCESS);
319: }
320: #else
321: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
322: {
323: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
324: PetscInt n = A->rmap->n;
325: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
326: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
327: const PetscInt *ai = a->i, *aj = a->j, *vi;
328: const MatScalar *aa = a->a, *v;
329: PetscInt *AiLo, *AjLo;
330: PetscInt i, nz, nzLower, offset, rowOffset;
332: PetscFunctionBegin;
333: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
334: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
335: try {
336: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
337: nzLower = n + ai[n] - ai[1];
338: if (!loTriFactor) {
339: PetscScalar *AALo;
341: PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
343: /* Allocate Space for the lower triangular matrix */
344: PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
345: PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
347: /* Fill the lower triangular matrix */
348: AiLo[0] = (PetscInt)0;
349: AiLo[n] = nzLower;
350: AjLo[0] = (PetscInt)0;
351: AALo[0] = (MatScalar)1.0;
352: v = aa;
353: vi = aj;
354: offset = 1;
355: rowOffset = 1;
356: for (i = 1; i < n; i++) {
357: nz = ai[i + 1] - ai[i];
358: /* additional 1 for the term on the diagonal */
359: AiLo[i] = rowOffset;
360: rowOffset += nz + 1;
362: PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
363: PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));
365: offset += nz;
366: AjLo[offset] = (PetscInt)i;
367: AALo[offset] = (MatScalar)1.0;
368: offset += 1;
370: v += nz;
371: vi += nz;
372: }
374: /* allocate space for the triangular factor information */
375: PetscCall(PetscNew(&loTriFactor));
376: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
377: /* Create the matrix description */
378: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
379: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
380: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
381: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
382: #else
383: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
384: #endif
385: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
386: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
388: /* set the operation */
389: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
391: /* set the matrix */
392: loTriFactor->csrMat = new CsrMatrix;
393: loTriFactor->csrMat->num_rows = n;
394: loTriFactor->csrMat->num_cols = n;
395: loTriFactor->csrMat->num_entries = nzLower;
397: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
398: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
400: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
401: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
403: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
404: loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
406: /* Create the solve analysis information */
407: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
408: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
409: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
410: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
411: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
412: PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
413: #endif
415: /* perform the solve analysis */
416: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
417: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
418: PetscCallCUDA(WaitForCUDA());
419: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
421: /* assign the pointer */
422: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
423: loTriFactor->AA_h = AALo;
424: PetscCallCUDA(cudaFreeHost(AiLo));
425: PetscCallCUDA(cudaFreeHost(AjLo));
426: PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
427: } else { /* update values only */
428: if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
429: /* Fill the lower triangular matrix */
430: loTriFactor->AA_h[0] = 1.0;
431: v = aa;
432: vi = aj;
433: offset = 1;
434: for (i = 1; i < n; i++) {
435: nz = ai[i + 1] - ai[i];
436: PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
437: offset += nz;
438: loTriFactor->AA_h[offset] = 1.0;
439: offset += 1;
440: v += nz;
441: }
442: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
443: PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
444: }
445: } catch (char *ex) {
446: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
447: }
448: }
449: PetscFunctionReturn(PETSC_SUCCESS);
450: }
452: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
453: {
454: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
455: PetscInt n = A->rmap->n;
456: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
457: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
458: const PetscInt *aj = a->j, *adiag = a->diag, *vi;
459: const MatScalar *aa = a->a, *v;
460: PetscInt *AiUp, *AjUp;
461: PetscInt i, nz, nzUpper, offset;
463: PetscFunctionBegin;
464: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
465: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
466: try {
467: /* next, figure out the number of nonzeros in the upper triangular matrix. */
468: nzUpper = adiag[0] - adiag[n];
469: if (!upTriFactor) {
470: PetscScalar *AAUp;
472: PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
474: /* Allocate Space for the upper triangular matrix */
475: PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
476: PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
478: /* Fill the upper triangular matrix */
479: AiUp[0] = (PetscInt)0;
480: AiUp[n] = nzUpper;
481: offset = nzUpper;
482: for (i = n - 1; i >= 0; i--) {
483: v = aa + adiag[i + 1] + 1;
484: vi = aj + adiag[i + 1] + 1;
486: /* number of elements NOT on the diagonal */
487: nz = adiag[i] - adiag[i + 1] - 1;
489: /* decrement the offset */
490: offset -= (nz + 1);
492: /* first, set the diagonal elements */
493: AjUp[offset] = (PetscInt)i;
494: AAUp[offset] = (MatScalar)1. / v[nz];
495: AiUp[i] = AiUp[i + 1] - (nz + 1);
497: PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
498: PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
499: }
501: /* allocate space for the triangular factor information */
502: PetscCall(PetscNew(&upTriFactor));
503: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
505: /* Create the matrix description */
506: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
507: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
508: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
509: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
510: #else
511: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
512: #endif
513: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
514: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
516: /* set the operation */
517: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
519: /* set the matrix */
520: upTriFactor->csrMat = new CsrMatrix;
521: upTriFactor->csrMat->num_rows = n;
522: upTriFactor->csrMat->num_cols = n;
523: upTriFactor->csrMat->num_entries = nzUpper;
525: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
526: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
528: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
529: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
531: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
532: upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
534: /* Create the solve analysis information */
535: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
536: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
537: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
538: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
539: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
540: PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
541: #endif
543: /* perform the solve analysis */
544: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
545: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
547: PetscCallCUDA(WaitForCUDA());
548: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
550: /* assign the pointer */
551: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
552: upTriFactor->AA_h = AAUp;
553: PetscCallCUDA(cudaFreeHost(AiUp));
554: PetscCallCUDA(cudaFreeHost(AjUp));
555: PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
556: } else {
557: if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
558: /* Fill the upper triangular matrix */
559: offset = nzUpper;
560: for (i = n - 1; i >= 0; i--) {
561: v = aa + adiag[i + 1] + 1;
563: /* number of elements NOT on the diagonal */
564: nz = adiag[i] - adiag[i + 1] - 1;
566: /* decrement the offset */
567: offset -= (nz + 1);
569: /* first, set the diagonal elements */
570: upTriFactor->AA_h[offset] = 1. / v[nz];
571: PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
572: }
573: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
574: PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
575: }
576: } catch (char *ex) {
577: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
578: }
579: }
580: PetscFunctionReturn(PETSC_SUCCESS);
581: }
582: #endif
584: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
585: {
586: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
587: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
588: IS isrow = a->row, iscol = a->icol;
589: PetscBool row_identity, col_identity;
590: PetscInt n = A->rmap->n;
592: PetscFunctionBegin;
593: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
594: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
595: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
596: #else
597: PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
598: PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
599: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
600: #endif
602: cusparseTriFactors->nnz = a->nz;
604: A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
605: /* lower triangular indices */
606: PetscCall(ISIdentity(isrow, &row_identity));
607: if (!row_identity && !cusparseTriFactors->rpermIndices) {
608: const PetscInt *r;
610: PetscCall(ISGetIndices(isrow, &r));
611: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
612: cusparseTriFactors->rpermIndices->assign(r, r + n);
613: PetscCall(ISRestoreIndices(isrow, &r));
614: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
615: }
617: /* upper triangular indices */
618: PetscCall(ISIdentity(iscol, &col_identity));
619: if (!col_identity && !cusparseTriFactors->cpermIndices) {
620: const PetscInt *c;
622: PetscCall(ISGetIndices(iscol, &c));
623: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
624: cusparseTriFactors->cpermIndices->assign(c, c + n);
625: PetscCall(ISRestoreIndices(iscol, &c));
626: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
627: }
628: PetscFunctionReturn(PETSC_SUCCESS);
629: }
631: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
632: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
633: {
634: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
635: PetscInt m = A->rmap->n;
636: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
637: const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
638: const MatScalar *Aa = a->a;
639: PetscInt *Mj, Mnz;
640: PetscScalar *Ma, *D;
642: PetscFunctionBegin;
643: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
644: if (!fs->csrRowPtr) { // Is this the first time doing the setup? Use csrRowPtr as the flag since it is not null even when m=0
645: // Rearrange the (skewed) factored matrix and put the result into M, a regular CSR matrix on the host.
646: // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
647: Mnz = Ai[m]; // Unz (with the unit diagonal)
648: PetscCall(PetscMalloc1(Mnz, &Ma));
649: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
650: PetscCall(PetscMalloc1(m, &D)); // the diagonal
651: for (PetscInt i = 0; i < m; i++) {
652: PetscInt ulen = Ai[i + 1] - Ai[i];
653: Mj[Ai[i]] = i; // diagonal entry
654: PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
655: }
656: // Copy M (U) from host to device
657: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
658: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
659: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
660: PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*(fs->diag)) * m));
661: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
662: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
664: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
665: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
666: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
667: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
668: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
669: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER;
670: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
671: const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
673: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
674: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
675: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
677: // Allocate work vectors for SpSV
678: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
679: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));
681: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
682: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
684: // Query buffer sizes for SpSV and then allocate buffers
685: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
686: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
687: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
689: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
690: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
691: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
693: // Record for reuse
694: fs->csrVal_h = Ma;
695: fs->diag_h = D;
696: PetscCall(PetscFree(Mj));
697: }
698: // Copy the value
699: Ma = fs->csrVal_h;
700: D = fs->diag_h;
701: Mnz = Ai[m];
702: for (PetscInt i = 0; i < m; i++) {
703: D[i] = Aa[Adiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal
704: Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
705: for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
706: }
707: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
708: PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
710: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
711: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
712: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
713: }
714: PetscFunctionReturn(PETSC_SUCCESS);
715: }
717: // Solve Ut D U x = b
718: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
719: {
720: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
721: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
722: const PetscScalar *barray;
723: PetscScalar *xarray;
724: thrust::device_ptr<const PetscScalar> bGPU;
725: thrust::device_ptr<PetscScalar> xGPU;
726: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
727: PetscInt m = A->rmap->n;
729: PetscFunctionBegin;
730: PetscCall(PetscLogGpuTimeBegin());
731: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
732: PetscCall(VecCUDAGetArrayRead(b, &barray));
733: xGPU = thrust::device_pointer_cast(xarray);
734: bGPU = thrust::device_pointer_cast(barray);
736: // Reorder b with the row permutation if needed, and wrap the result in fs->X
737: if (fs->rpermIndices) {
738: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
739: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
740: } else {
741: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
742: }
744: // Solve Ut Y = X
745: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
746: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
748: // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
749: // It is basically a vector element-wise multiplication, but cublas does not have it!
750: PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
752: // Solve U X = Y
753: if (fs->cpermIndices) { // if we need to permute, use the intermediate buffer X
754: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
755: } else {
756: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
757: }
758: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
760: // Reorder X with the column permutation if needed, and put the result back to x
761: if (fs->cpermIndices) {
762: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
763: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
764: }
766: PetscCall(VecCUDARestoreArrayRead(b, &barray));
767: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
768: PetscCall(PetscLogGpuTimeEnd());
769: PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
770: PetscFunctionReturn(PETSC_SUCCESS);
771: }
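// Recap of the steps above (illustrative, not from the original source), solving Ut D U x = b:
//   1. gather b through rpermIndices into the work vector X (or use b directly when there is no row permutation),
//   2. solve U^T Y = X with the transpose SpSV descriptor spsvDescr_Ut,
//   3. scale Y element-wise by fs->diag, which already holds the inverted diagonal of D,
//   4. solve U X = Y with the non-transpose SpSV descriptor spsvDescr_U,
//   5. gather the result through cpermIndices into x when a column permutation is present.
// Two triangular solves plus one element-wise product are consistent with the 4*nz - n flops logged above.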
772: #else
773: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
774: {
775: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
776: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
777: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
778: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
779: PetscInt *AiUp, *AjUp;
780: PetscScalar *AAUp;
781: PetscScalar *AALo;
782: PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
783: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data;
784: const PetscInt *ai = b->i, *aj = b->j, *vj;
785: const MatScalar *aa = b->a, *v;
787: PetscFunctionBegin;
788: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
789: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
790: try {
791: PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
792: PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
793: if (!upTriFactor && !loTriFactor) {
794: /* Allocate Space for the upper triangular matrix */
795: PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
796: PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
798: /* Fill the upper triangular matrix */
799: AiUp[0] = (PetscInt)0;
800: AiUp[n] = nzUpper;
801: offset = 0;
802: for (i = 0; i < n; i++) {
803: /* set the pointers */
804: v = aa + ai[i];
805: vj = aj + ai[i];
806: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
808: /* first, set the diagonal elements */
809: AjUp[offset] = (PetscInt)i;
810: AAUp[offset] = (MatScalar)1.0 / v[nz];
811: AiUp[i] = offset;
812: AALo[offset] = (MatScalar)1.0 / v[nz];
814: offset += 1;
815: if (nz > 0) {
816: PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
817: PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
818: for (j = offset; j < offset + nz; j++) {
819: AAUp[j] = -AAUp[j];
820: AALo[j] = AAUp[j] / v[nz];
821: }
822: offset += nz;
823: }
824: }
826: /* allocate space for the triangular factor information */
827: PetscCall(PetscNew(&upTriFactor));
828: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
830: /* Create the matrix description */
831: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
832: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
833: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
834: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
835: #else
836: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
837: #endif
838: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
839: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
841: /* set the matrix */
842: upTriFactor->csrMat = new CsrMatrix;
843: upTriFactor->csrMat->num_rows = A->rmap->n;
844: upTriFactor->csrMat->num_cols = A->cmap->n;
845: upTriFactor->csrMat->num_entries = a->nz;
847: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
848: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
850: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
851: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
853: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
854: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
856: /* set the operation */
857: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
859: /* Create the solve analysis information */
860: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
861: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
862: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
863: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
864: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
865: PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
866: #endif
868: /* perform the solve analysis */
869: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
870: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
872: PetscCallCUDA(WaitForCUDA());
873: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
875: /* assign the pointer */
876: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
878: /* allocate space for the triangular factor information */
879: PetscCall(PetscNew(&loTriFactor));
880: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
882: /* Create the matrix description */
883: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
884: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
885: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
886: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
887: #else
888: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
889: #endif
890: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
891: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
893: /* set the operation */
894: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
896: /* set the matrix */
897: loTriFactor->csrMat = new CsrMatrix;
898: loTriFactor->csrMat->num_rows = A->rmap->n;
899: loTriFactor->csrMat->num_cols = A->cmap->n;
900: loTriFactor->csrMat->num_entries = a->nz;
902: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
903: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
905: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
906: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
908: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
909: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
911: /* Create the solve analysis information */
912: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
913: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
914: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
915: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
916: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
917: PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
918: #endif
920: /* perform the solve analysis */
921: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
922: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
924: PetscCallCUDA(WaitForCUDA());
925: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
927: /* assign the pointer */
928: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
930: PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
931: PetscCallCUDA(cudaFreeHost(AiUp));
932: PetscCallCUDA(cudaFreeHost(AjUp));
933: } else {
934: /* Fill the upper triangular matrix */
935: offset = 0;
936: for (i = 0; i < n; i++) {
937: /* set the pointers */
938: v = aa + ai[i];
939: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
941: /* first, set the diagonal elements */
942: AAUp[offset] = 1.0 / v[nz];
943: AALo[offset] = 1.0 / v[nz];
945: offset += 1;
946: if (nz > 0) {
947: PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
948: for (j = offset; j < offset + nz; j++) {
949: AAUp[j] = -AAUp[j];
950: AALo[j] = AAUp[j] / v[nz];
951: }
952: offset += nz;
953: }
954: }
955: PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
956: PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
957: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
958: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
959: PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
960: }
961: PetscCallCUDA(cudaFreeHost(AAUp));
962: PetscCallCUDA(cudaFreeHost(AALo));
963: } catch (char *ex) {
964: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
965: }
966: }
967: PetscFunctionReturn(PETSC_SUCCESS);
968: }
969: #endif
971: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
972: {
973: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
974: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
975: IS ip = a->row;
976: PetscBool perm_identity;
977: PetscInt n = A->rmap->n;
979: PetscFunctionBegin;
980: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
982: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
983: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
984: #else
985: PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
986: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
987: #endif
988: cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
990: A->offloadmask = PETSC_OFFLOAD_BOTH;
992: /* lower triangular indices */
993: PetscCall(ISIdentity(ip, &perm_identity));
994: if (!perm_identity) {
995: IS iip;
996: const PetscInt *irip, *rip;
998: PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
999: PetscCall(ISGetIndices(iip, &irip));
1000: PetscCall(ISGetIndices(ip, &rip));
1001: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
1002: cusparseTriFactors->rpermIndices->assign(rip, rip + n);
1003: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
1004: cusparseTriFactors->cpermIndices->assign(irip, irip + n);
1005: PetscCall(ISRestoreIndices(iip, &irip));
1006: PetscCall(ISDestroy(&iip));
1007: PetscCall(ISRestoreIndices(ip, &rip));
1008: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1009: }
1010: PetscFunctionReturn(PETSC_SUCCESS);
1011: }
1013: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1014: {
1015: PetscFunctionBegin;
1016: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1017: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1018: B->offloadmask = PETSC_OFFLOAD_CPU;
1020: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1021: B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky;
1022: B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1023: #else
1024: /* determine which version of MatSolve needs to be used. */
1025: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1026: IS ip = b->row;
1027: PetscBool perm_identity;
1029: PetscCall(ISIdentity(ip, &perm_identity));
1030: if (perm_identity) {
1031: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1032: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1033: } else {
1034: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
1035: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1036: }
1037: #endif
1038: B->ops->matsolve = NULL;
1039: B->ops->matsolvetranspose = NULL;
1041: /* get the triangular factors */
1042: PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
1043: PetscFunctionReturn(PETSC_SUCCESS);
1044: }
1046: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1047: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1048: {
1049: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1050: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1051: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1052: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1053: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1054: cusparseIndexBase_t indexBase;
1055: cusparseMatrixType_t matrixType;
1056: cusparseFillMode_t fillMode;
1057: cusparseDiagType_t diagType;
1059: PetscFunctionBegin;
1060: /* allocate space for the transpose of the lower triangular factor */
1061: PetscCall(PetscNew(&loTriFactorT));
1062: loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1064: /* set the matrix descriptors of the lower triangular factor */
1065: matrixType = cusparseGetMatType(loTriFactor->descr);
1066: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1067: fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1068: diagType = cusparseGetMatDiagType(loTriFactor->descr);
1070: /* Create the matrix description */
1071: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1072: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1073: PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1074: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1075: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1077: /* set the operation */
1078: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1080: /* allocate GPU space for the CSC of the lower triangular factor*/
1081: loTriFactorT->csrMat = new CsrMatrix;
1082: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
1083: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
1084: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
1085: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1086: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1087: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1089: /* compute the transpose of the lower triangular factor, i.e. the CSC */
1090: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1091: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1092: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1093: loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1094: PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1095: #endif
1097: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1098: {
1099: // there is no clean way to wrap this function call in PetscCallCUSPARSE...
1100: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1101: loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1102: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1103: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1104: #else
1105: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1106: #endif
1107: PetscCallCUSPARSE(stat);
1108: }
1110: PetscCallCUDA(WaitForCUDA());
1111: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1113: /* Create the solve analysis information */
1114: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1115: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1116: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1117: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1118: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1119: PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1120: #endif
1122: /* perform the solve analysis */
1123: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1124: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1126: PetscCallCUDA(WaitForCUDA());
1127: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1129: /* assign the pointer */
1130: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1132: /*********************************************/
1133: /* Now the Transpose of the Upper Tri Factor */
1134: /*********************************************/
1136: /* allocate space for the transpose of the upper triangular factor */
1137: PetscCall(PetscNew(&upTriFactorT));
1138: upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1140: /* set the matrix descriptors of the upper triangular factor */
1141: matrixType = cusparseGetMatType(upTriFactor->descr);
1142: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1143: fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1144: diagType = cusparseGetMatDiagType(upTriFactor->descr);
1146: /* Create the matrix description */
1147: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1148: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1149: PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1150: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1151: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1153: /* set the operation */
1154: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1156: /* allocate GPU space for the CSC of the upper triangular factor*/
1157: upTriFactorT->csrMat = new CsrMatrix;
1158: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
1159: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
1160: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
1161: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1162: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1163: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1165: /* compute the transpose of the upper triangular factor, i.e. the CSC */
1166: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1167: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1168: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1169: upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1170: PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1171: #endif
1173: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1174: {
1175: // there is no clean way to wrap this function call in PetscCallCUSPARSE...
1176: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1177: upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1178: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1179: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1180: #else
1181: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1182: #endif
1183: PetscCallCUSPARSE(stat);
1184: }
1186: PetscCallCUDA(WaitForCUDA());
1187: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1189: /* Create the solve analysis information */
1190: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1191: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1192: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1193: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1194: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1195: PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1196: #endif
1198: /* perform the solve analysis */
1199: /* TODO: this block duplicates the lower-factor analysis above and should be factored into a helper function */
1200: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1201: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1203: PetscCallCUDA(WaitForCUDA());
1204: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1206: /* assign the pointer */
1207: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1208: PetscFunctionReturn(PETSC_SUCCESS);
1209: }
1210: #endif
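/* Functor used in MatSeqAIJCUSPARSEFormExplicitTranspose() below: csr2csc is run on an identity permutation
   encoded as PetscScalar values, and this functor converts the resulting scalars back to PetscInt indices to
   build the cached CSR-to-CSC permutation csr2csc_i */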
1212: struct PetscScalarToPetscInt {
1213: __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1214: };
1216: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1217: {
1218: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1219: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1220: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1221: cusparseStatus_t stat;
1222: cusparseIndexBase_t indexBase;
1224: PetscFunctionBegin;
1225: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1226: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1227: PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1228: matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1229: PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1230: if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1231: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1232: PetscCall(PetscLogGpuTimeBegin());
1233: if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1234: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1235: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1236: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1237: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1238: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1239: PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1241: /* set alpha and beta */
1242: PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
1243: PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
1244: PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1245: PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1246: PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1247: PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1249: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1250: CsrMatrix *matrixT = new CsrMatrix;
1251: matstructT->mat = matrixT;
1252: matrixT->num_rows = A->cmap->n;
1253: matrixT->num_cols = A->rmap->n;
1254: matrixT->num_entries = a->nz;
1255: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1256: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1257: matrixT->values = new THRUSTARRAY(a->nz);
1259: if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1260: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1262: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1263: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1264: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1265: indexBase, cusparse_scalartype);
1266: PetscCallCUSPARSE(stat);
1267: #else
1268: /* cusparse-11.x errors out on zero-sized matrices until 11.2.1,
1269: see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1271: It is unclear what a proper value for matstructT->matDescr would be with empty matrices, so we set it
1272: to NULL so that any code relying on it fails loudly. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1273: when nnz = 0, matrixT->row_offsets[] should be filled with indexBase, so we also set it accordingly.
1274: */
1275: if (matrixT->num_entries) {
1276: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1277: PetscCallCUSPARSE(stat);
1279: } else {
1280: matstructT->matDescr = NULL;
1281: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1282: }
1283: #endif
1284: #endif
1285: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1286: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1287: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1288: #else
1289: CsrMatrix *temp = new CsrMatrix;
1290: CsrMatrix *tempT = new CsrMatrix;
1291: /* First convert HYB to CSR */
1292: temp->num_rows = A->rmap->n;
1293: temp->num_cols = A->cmap->n;
1294: temp->num_entries = a->nz;
1295: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1296: temp->column_indices = new THRUSTINTARRAY32(a->nz);
1297: temp->values = new THRUSTARRAY(a->nz);
1299: stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1300: PetscCallCUSPARSE(stat);
1302: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1303: tempT->num_rows = A->rmap->n;
1304: tempT->num_cols = A->cmap->n;
1305: tempT->num_entries = a->nz;
1306: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1307: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1308: tempT->values = new THRUSTARRAY(a->nz);
1310: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1311: tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1312: PetscCallCUSPARSE(stat);
1314: /* Last, convert CSC to HYB */
1315: cusparseHybMat_t hybMat;
1316: PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1317: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1318: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1319: PetscCallCUSPARSE(stat);
1321: /* assign the pointer */
1322: matstructT->mat = hybMat;
1323: A->transupdated = PETSC_TRUE;
1324: /* delete temporaries */
1325: if (tempT) {
1326: if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1327: if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1328: if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1329: delete (CsrMatrix *)tempT;
1330: }
1331: if (temp) {
1332: if (temp->values) delete (THRUSTARRAY *)temp->values;
1333: if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1334: if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1335: delete (CsrMatrix *)temp;
1336: }
1337: #endif
1338: }
1339: }
1340: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1341: CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
1342: CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1343: PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1344: PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1345: PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1346: PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1347: PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1348: PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1349: PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1350: PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1351: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1352: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1353: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1354: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1355: }
1356: if (!cusparsestruct->csr2csc_i) {
1357: THRUSTARRAY csr2csc_a(matrix->num_entries);
1358: PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
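      /* csr2csc_a holds the identity map 0,1,2,... encoded as scalars; feeding it through csr2csc below records,
         for each entry of the transpose, the position of the source entry in the CSR matrix */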
1360: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1361: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1362: void *csr2cscBuffer;
1363: size_t csr2cscBufferSize;
1364: stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1365: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1366: PetscCallCUSPARSE(stat);
1367: PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1368: #endif
1370: if (matrix->num_entries) {
1371: /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1372: mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11, though CUDA-10 is fine.
1373: Every parameter was checked and looked fine; it is unclear why cusparse complains.
1375: Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1376: should be filled with indexBase, so we just take that shortcut here.
1377: */
1378: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1379: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1380: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1381: PetscCallCUSPARSE(stat);
1382: #else
1383: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1384: PetscCallCUSPARSE(stat);
1385: #endif
1386: } else {
1387: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1388: }
1390: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1391: PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1392: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1393: PetscCallCUDA(cudaFree(csr2cscBuffer));
1394: #endif
1395: }
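    /* Update the transposed values with a gather through the cached permutation; on later calls only this copy is
       needed, since the sparsity pattern of the transpose does not change */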
1396: PetscCallThrust(
1397: thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1398: }
1399: PetscCall(PetscLogGpuTimeEnd());
1400: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1401: /* the compressed row indices are not used for matTranspose */
1402: matstructT->cprowIndices = NULL;
1403: /* assign the pointer */
1404: ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1405: A->transupdated = PETSC_TRUE;
1406: PetscFunctionReturn(PETSC_SUCCESS);
1407: }
1409: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1410: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1411: {
1412: const PetscScalar *barray;
1413: PetscScalar *xarray;
1414: thrust::device_ptr<const PetscScalar> bGPU;
1415: thrust::device_ptr<PetscScalar> xGPU;
1416: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1417: const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1418: const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
1419: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1420: PetscInt m = A->rmap->n;
1422: PetscFunctionBegin;
1423: PetscCall(PetscLogGpuTimeBegin());
1424: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1425: PetscCall(VecCUDAGetArrayRead(b, &barray));
1426: xGPU = thrust::device_pointer_cast(xarray);
1427: bGPU = thrust::device_pointer_cast(barray);
1429: // Reorder b with the row permutation if needed, and wrap the result in fs->X
1430: if (fs->rpermIndices) {
1431: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1432: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1433: } else {
1434: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1435: }
1437: // Solve L Y = X
1438: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1439: // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1440: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1442: // Solve U X = Y
1443: if (fs->cpermIndices) {
1444: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1445: } else {
1446: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1447: }
1448: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1450: // Reorder X with the column permutation if needed, and put the result back to x
1451: if (fs->cpermIndices) {
1452: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1453: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1454: }
1455: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1456: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1457: PetscCall(PetscLogGpuTimeEnd());
1458: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1459: PetscFunctionReturn(PETSC_SUCCESS);
1460: }
1462: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1463: {
1464: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1465: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1466: const PetscScalar *barray;
1467: PetscScalar *xarray;
1468: thrust::device_ptr<const PetscScalar> bGPU;
1469: thrust::device_ptr<PetscScalar> xGPU;
1470: const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE;
1471: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1472: PetscInt m = A->rmap->n;
1474: PetscFunctionBegin;
1475: PetscCall(PetscLogGpuTimeBegin());
1476: if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1477: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1478: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1479: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1481: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1482: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1483: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1484: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1485: fs->createdTransposeSpSVDescr = PETSC_TRUE;
1486: }
1488: if (!fs->updatedTransposeSpSVAnalysis) {
1489: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1491: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1492: fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1493: }
1495: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1496: PetscCall(VecCUDAGetArrayRead(b, &barray));
1497: xGPU = thrust::device_pointer_cast(xarray);
1498: bGPU = thrust::device_pointer_cast(barray);
1500: // Reorder b with the row permutation if needed, and wrap the result in fs->X
1501: if (fs->rpermIndices) {
1502: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1503: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1504: } else {
1505: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1506: }
1508: // Solve Ut Y = X
1509: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1510: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1512: // Solve Lt X = Y
1513: if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1514: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1515: } else {
1516: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1517: }
1518: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1520: // Reorder X with the column permutation if needed, and put the result back to x
1521: if (fs->cpermIndices) {
1522: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1523: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1524: }
1526: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1527: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1528: PetscCall(PetscLogGpuTimeEnd());
1529: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1530: PetscFunctionReturn(PETSC_SUCCESS);
1531: }
1532: #else
1533: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1534: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1535: {
1536: PetscInt n = xx->map->n;
1537: const PetscScalar *barray;
1538: PetscScalar *xarray;
1539: thrust::device_ptr<const PetscScalar> bGPU;
1540: thrust::device_ptr<PetscScalar> xGPU;
1541: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1542: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1543: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1544: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1546: PetscFunctionBegin;
1547: /* Analyze the matrix and create the transpose ... on the fly */
1548: if (!loTriFactorT && !upTriFactorT) {
1549: PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1550: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1551: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1552: }
1554: /* Get the GPU pointers */
1555: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1556: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1557: xGPU = thrust::device_pointer_cast(xarray);
1558: bGPU = thrust::device_pointer_cast(barray);
1560: PetscCall(PetscLogGpuTimeBegin());
1561: /* First, reorder with the row permutation */
1562: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1564: /* Next, solve U */
1565: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1566: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1568: /* Then, solve L */
1569: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1570: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1572: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1573: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1575: /* Copy the temporary to the full solution. */
1576: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1578: /* restore */
1579: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1580: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1581: PetscCall(PetscLogGpuTimeEnd());
1582: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1583: PetscFunctionReturn(PETSC_SUCCESS);
1584: }
1586: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1587: {
1588: const PetscScalar *barray;
1589: PetscScalar *xarray;
1590: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1591: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1592: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1593: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1595: PetscFunctionBegin;
1596: /* Analyze the matrix and create the transpose ... on the fly */
1597: if (!loTriFactorT && !upTriFactorT) {
1598: PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1599: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1600: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1601: }
1603: /* Get the GPU pointers */
1604: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1605: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1607: PetscCall(PetscLogGpuTimeBegin());
1608: /* First, solve U */
1609: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1610: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1612: /* Then, solve L */
1613: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1614: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1616: /* restore */
1617: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1618: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1619: PetscCall(PetscLogGpuTimeEnd());
1620: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1621: PetscFunctionReturn(PETSC_SUCCESS);
1622: }
1624: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1625: {
1626: const PetscScalar *barray;
1627: PetscScalar *xarray;
1628: thrust::device_ptr<const PetscScalar> bGPU;
1629: thrust::device_ptr<PetscScalar> xGPU;
1630: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1631: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1632: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1633: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1635: PetscFunctionBegin;
1636: /* Get the GPU pointers */
1637: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1638: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1639: xGPU = thrust::device_pointer_cast(xarray);
1640: bGPU = thrust::device_pointer_cast(barray);
1642: PetscCall(PetscLogGpuTimeBegin());
1643: /* First, reorder with the row permutation */
1644: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1646: /* Next, solve L */
1647: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1648: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1650: /* Then, solve U */
1651: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1652: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1654: /* Last, reorder with the column permutation */
1655: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
1657: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1658: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1659: PetscCall(PetscLogGpuTimeEnd());
1660: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1661: PetscFunctionReturn(PETSC_SUCCESS);
1662: }
1664: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1665: {
1666: const PetscScalar *barray;
1667: PetscScalar *xarray;
1668: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1669: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1670: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1671: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1673: PetscFunctionBegin;
1674: /* Get the GPU pointers */
1675: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1676: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1678: PetscCall(PetscLogGpuTimeBegin());
1679: /* First, solve L */
1680: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1681: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1683: /* Next, solve U */
1684: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1685: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1687: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1688: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1689: PetscCall(PetscLogGpuTimeEnd());
1690: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1691: PetscFunctionReturn(PETSC_SUCCESS);
1692: }
1693: #endif
1695: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
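/* CUDA >= 11.4 path: ILU(0)/ICC(0) use the csrilu02/csric02 numeric factorization kernels combined with the
   generic cusparseSpSV API for the triangular solves */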
1696: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1697: {
1698: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1699: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1700: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1701: CsrMatrix *Acsr;
1702: PetscInt m, nz;
1703: PetscBool flg;
1705: PetscFunctionBegin;
1706: if (PetscDefined(USE_DEBUG)) {
1707: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1708: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1709: }
1711: /* Copy A's value to fact */
1712: m = fact->rmap->n;
1713: nz = aij->nz;
1714: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1715: Acsr = (CsrMatrix *)Acusp->mat->mat;
1716: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1718: /* Factorize fact inplace */
1719: if (m)
1720: PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1721: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1722: if (PetscDefined(USE_DEBUG)) {
1723: int numerical_zero;
1724: cusparseStatus_t status;
1725: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1726: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1727: }
1729: /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, so we do it after cusparseXcsrilu02().
1730: See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1731: */
1732: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1734: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1736: /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1737: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1739: fact->offloadmask = PETSC_OFFLOAD_GPU;
1740: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1741: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1742: fact->ops->matsolve = NULL;
1743: fact->ops->matsolvetranspose = NULL;
1744: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1745: PetscFunctionReturn(PETSC_SUCCESS);
1746: }
1748: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1749: {
1750: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1751: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1752: PetscInt m, nz;
1754: PetscFunctionBegin;
1755: if (PetscDefined(USE_DEBUG)) {
1756: PetscInt i;
1757: PetscBool flg, missing;
1759: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1760: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1761: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1762: PetscCall(MatMissingDiagonal(A, &missing, &i));
1763: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1764: }
1766: /* Free the old stale stuff */
1767: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1769: /* Copy over A's meta data to fact. Note that we also allocate fact's i,j,a on host,
1770: though they will not be used; we allocate them only to ease debugging.
1771: */
1772: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1774: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1775: fact->factortype = MAT_FACTOR_ILU;
1776: fact->info.factor_mallocs = 0;
1777: fact->info.fill_ratio_given = info->fill;
1778: fact->info.fill_ratio_needed = 1.0;
1780: aij->row = NULL;
1781: aij->col = NULL;
1783: /* ====================================================================== */
1784: /* Copy A's i, j to fact and also allocate the value array of fact. */
1785: /* We'll do in-place factorization on fact */
1786: /* ====================================================================== */
1787: const int *Ai, *Aj;
1789: m = fact->rmap->n;
1790: nz = aij->nz;
1792: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
1793: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
1794: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*(fs->csrVal)) * nz));
1795: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. The returned Ai, Aj are 32-bit */
1796: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1797: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
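  /* ILU(0) incurs no fill-in, so the factor reuses A's sparsity pattern unchanged: we copy A's row pointers and
     column indices and will factor the values in place */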
1799: /* ====================================================================== */
1800: /* Create descriptors for M, L, U */
1801: /* ====================================================================== */
1802: cusparseFillMode_t fillMode;
1803: cusparseDiagType_t diagType;
1805: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1806: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1807: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1809: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1810: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1811: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1812: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1813: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1814: */
1815: fillMode = CUSPARSE_FILL_MODE_LOWER;
1816: diagType = CUSPARSE_DIAG_TYPE_UNIT;
1817: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1818: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1819: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1821: fillMode = CUSPARSE_FILL_MODE_UPPER;
1822: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1823: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1824: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1825: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
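  /* L and U are views of the same in-place factored arrays (csrRowPtr32/csrColIdx32/csrVal); the fill-mode and
     diag-type attributes tell cusparseSpSV which triangle to use. L is taken to have a unit diagonal (the stored
     diagonal belongs to U), while U has a non-unit diagonal. */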
1827: /* ========================================================================= */
1828: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1829: /* ========================================================================= */
1830: PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1831: if (m)
1832: PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1833: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1835: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1836: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1838: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1839: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1841: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1842: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1844: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1845: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1847: /* From experiments with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1848: and the discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1849: spsvBuffer_L and spsvBuffer_U cannot be the same buffer in our case, but factBuffer_M can be shared with either of them.
1850: To save memory, we share factBuffer_M with the bigger of spsvBuffer_L/U.
1851: */
1852: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1853: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1854: fs->spsvBuffer_L = fs->factBuffer_M;
1855: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1856: } else {
1857: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1858: fs->spsvBuffer_U = fs->factBuffer_M;
1859: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1860: }
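  /* With this sharing, the device memory used by the buffers is max(factBufferSize_M, max(spsvBufferSize_L, spsvBufferSize_U))
     plus the smaller of spsvBufferSize_L/U */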
1862: /* ========================================================================== */
1863: /* Perform analysis of ilu0 on M, SpSv on L and U */
1864: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1865: /* ========================================================================== */
1866: int structural_zero;
1867: cusparseStatus_t status;
1869: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1870: if (m)
1871: PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1872: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1873: if (PetscDefined(USE_DEBUG)) {
1874: /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1875: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1876: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1877: }
1879: /* Estimate FLOPs of the numeric factorization */
1880: {
1881: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1882: PetscInt *Ai, *Adiag, nzRow, nzLeft;
1883: PetscLogDouble flops = 0.0;
1885: PetscCall(MatMarkDiagonal_SeqAIJ(A));
1886: Ai = Aseq->i;
1887: Adiag = Aseq->diag;
1888: for (PetscInt i = 0; i < m; i++) {
1889: if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1890: nzRow = Ai[i + 1] - Ai[i];
1891: nzLeft = Adiag[i] - Ai[i];
1892: /* We eliminate the nonzeros left of the diagonal one by one. Assume each elimination updates the nonzeros
1893: to the right of (and including) the eliminated one, each update costing one multiplication and one addition.
1894: */
1895: nzLeft = (nzRow - 1) / 2;
1896: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
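        /* e.g., a row with nzRow = 5 gives nzLeft = (5 - 1) / 2 = 2 and adds 2 * (2 * 5 - 2 + 1) = 18 flops */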
1897: }
1898: }
1899: fs->numericFactFlops = flops;
1900: }
1901: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1902: PetscFunctionReturn(PETSC_SUCCESS);
1903: }
1905: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1906: {
1907: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1908: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1909: const PetscScalar *barray;
1910: PetscScalar *xarray;
1912: PetscFunctionBegin;
1913: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1914: PetscCall(VecCUDAGetArrayRead(b, &barray));
1915: PetscCall(PetscLogGpuTimeBegin());
1917: /* Solve L*y = b */
1918: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1919: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1920: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1921: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1923: /* Solve Lt*x = y */
1924: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1925: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1926: fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1928: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1929: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1931: PetscCall(PetscLogGpuTimeEnd());
1932: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1933: PetscFunctionReturn(PETSC_SUCCESS);
1934: }
1936: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1937: {
1938: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1939: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1940: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1941: CsrMatrix *Acsr;
1942: PetscInt m, nz;
1943: PetscBool flg;
1945: PetscFunctionBegin;
1946: if (PetscDefined(USE_DEBUG)) {
1947: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1948: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1949: }
1951: /* Copy A's value to fact */
1952: m = fact->rmap->n;
1953: nz = aij->nz;
1954: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1955: Acsr = (CsrMatrix *)Acusp->mat->mat;
1956: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1958: /* Factorize fact inplace */
1959: /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1960: Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1961: The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1962: and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1963: In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1964: */
1965: if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1966: if (PetscDefined(USE_DEBUG)) {
1967: int numerical_zero;
1968: cusparseStatus_t status;
1969: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1970: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1971: }
1973: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1975: /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1976: ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1977: */
1978: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1980: fact->offloadmask = PETSC_OFFLOAD_GPU;
1981: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0;
1982: fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0;
1983: fact->ops->matsolve = NULL;
1984: fact->ops->matsolvetranspose = NULL;
1985: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1986: PetscFunctionReturn(PETSC_SUCCESS);
1987: }
1989: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1990: {
1991: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1992: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1993: PetscInt m, nz;
1995: PetscFunctionBegin;
1996: if (PetscDefined(USE_DEBUG)) {
1997: PetscInt i;
1998: PetscBool flg, missing;
2000: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2001: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2002: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2003: PetscCall(MatMissingDiagonal(A, &missing, &i));
2004: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2005: }
2007: /* Free the old stale stuff */
2008: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2010: /* Copy over A's meta data to fact. Note that we also allocate fact's i,j,a on host,
2011: though they will not be used; we allocate them only to ease debugging.
2012: */
2013: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2015: fact->offloadmask = PETSC_OFFLOAD_BOTH;
2016: fact->factortype = MAT_FACTOR_ICC;
2017: fact->info.factor_mallocs = 0;
2018: fact->info.fill_ratio_given = info->fill;
2019: fact->info.fill_ratio_needed = 1.0;
2021: aij->row = NULL;
2022: aij->col = NULL;
2024: /* ====================================================================== */
2025: /* Copy A's i, j to fact and also allocate the value array of fact. */
2026: /* We'll do in-place factorization on fact */
2027: /* ====================================================================== */
2028: const int *Ai, *Aj;
2030: m = fact->rmap->n;
2031: nz = aij->nz;
2033: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
2034: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
2035: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2036: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2037: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2038: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2040: /* ====================================================================== */
2041: /* Create mat descriptors for M, L */
2042: /* ====================================================================== */
2043: cusparseFillMode_t fillMode;
2044: cusparseDiagType_t diagType;
2046: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2047: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2048: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2050: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2051: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2052: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2053: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2054: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2055: */
2056: fillMode = CUSPARSE_FILL_MODE_LOWER;
2057: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2058: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2059: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2060: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2062: /* ========================================================================= */
2063: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
2064: /* ========================================================================= */
2065: PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2066: if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2068: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2069: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2071: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2072: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2074: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2075: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2077: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2078: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2080: /* To save device memory, we share the factorization buffer with the larger of the two triangular-solve buffers.
2081: See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2082: */
2083: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2084: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2085: fs->spsvBuffer_L = fs->factBuffer_M;
2086: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2087: } else {
2088: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2089: fs->spsvBuffer_Lt = fs->factBuffer_M;
2090: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2091: }
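  /* Worked instance of the sharing above (illustrative numbers only): if spsvBufferSize_L = 1.0 MB,
     spsvBufferSize_Lt = 0.5 MB and factBufferSize_M = 2.0 MB, the first branch is taken: factBuffer_M is
     allocated with max(1.0, 2.0) = 2.0 MB and doubles as spsvBuffer_L, while a separate 0.5 MB buffer is
     allocated for spsvBuffer_Lt. The total device allocation is thus
     max(factBufferSize_M, spsvBufferSize_L) + spsvBufferSize_Lt = 2.5 MB instead of 3.5 MB for three
     separate buffers. */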
2093: /* ========================================================================== */
2094: /* Perform analysis of ic0 on M */
2095: /* The lower triangular part of M has the same sparsity pattern as L */
2096: /* ========================================================================== */
2097: int structural_zero;
2098: cusparseStatus_t status;
2100: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2101: if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2102: if (PetscDefined(USE_DEBUG)) {
2103: /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2104: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2105: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2106: }
2108: /* Estimate FLOPs of the numeric factorization */
2109: {
2110: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
2111: PetscInt *Ai, nzRow, nzLeft;
2112: PetscLogDouble flops = 0.0;
2114: Ai = Aseq->i;
2115: for (PetscInt i = 0; i < m; i++) {
2116: nzRow = Ai[i + 1] - Ai[i];
2117: if (nzRow > 1) {
2118: /* We eliminate the nonzeros to the left of the diagonal one by one. Each elimination updates the nonzeros
2119: to the right of, and including, the eliminated entry, costing one multiplication and one addition per updated entry.
2120: */
2121: nzLeft = (nzRow - 1) / 2;
2122: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2123: }
2124: }
2125: fs->numericFactFlops = flops;
2126: }
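  /* Worked instance of the estimate above (illustrative): a row with nzRow = 5 stored nonzeros gives
     nzLeft = (5 - 1) / 2 = 2, so it contributes 2 * (2.0 * 5 - 2 + 1) = 18 flops to numericFactFlops. */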
2127: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2128: PetscFunctionReturn(PETSC_SUCCESS);
2129: }
2130: #endif
2132: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2133: {
2134: // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2135: Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2137: PetscFunctionBegin;
2138: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2139: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2140: B->offloadmask = PETSC_OFFLOAD_CPU;
2142: if (!cusparsestruct->use_cpu_solve) {
2143: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2144: B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU;
2145: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2146: #else
2147: /* determine which version of MatSolve needs to be used. */
2148: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
2149: IS isrow = b->row, iscol = b->col;
2150: PetscBool row_identity, col_identity;
2152: PetscCall(ISIdentity(isrow, &row_identity));
2153: PetscCall(ISIdentity(iscol, &col_identity));
2154: if (row_identity && col_identity) {
2155: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2156: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2157: } else {
2158: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
2159: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2160: }
2161: #endif
2162: }
2163: B->ops->matsolve = NULL;
2164: B->ops->matsolvetranspose = NULL;
2166: /* get the triangular factors */
2167: if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2168: PetscFunctionReturn(PETSC_SUCCESS);
2169: }
2171: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2172: {
2173: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
2175: PetscFunctionBegin;
2176: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2177: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2178: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2179: PetscFunctionReturn(PETSC_SUCCESS);
2180: }
2182: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2183: {
2184: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2186: PetscFunctionBegin;
2187: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2188: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2189: if (cusparseTriFactors->factorizeOnDevice) {
2190: PetscCall(ISIdentity(isrow, &row_identity));
2191: PetscCall(ISIdentity(iscol, &col_identity));
2192: }
2193: if (!info->levels && row_identity && col_identity) {
2194: PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2195: } else
2196: #endif
2197: {
2198: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2199: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2200: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2201: }
2202: PetscFunctionReturn(PETSC_SUCCESS);
2203: }
2205: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2206: {
2207: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2209: PetscFunctionBegin;
2210: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2211: PetscBool perm_identity = PETSC_FALSE;
2212: if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
2213: if (!info->levels && perm_identity) {
2214: PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2215: } else
2216: #endif
2217: {
2218: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2219: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2220: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2221: }
2222: PetscFunctionReturn(PETSC_SUCCESS);
2223: }
2225: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2226: {
2227: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2229: PetscFunctionBegin;
2230: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2231: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2232: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2233: PetscFunctionReturn(PETSC_SUCCESS);
2234: }
2236: static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2237: {
2238: PetscFunctionBegin;
2239: *type = MATSOLVERCUSPARSE;
2240: PetscFunctionReturn(PETSC_SUCCESS);
2241: }
2243: /*MC
2244: MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
2245: of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
2246: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2247: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2248: cuSPARSE triangular solve algorithm. However, the performance can be quite poor, so these
2249: algorithms are not recommended. This class does NOT support direct solver operations.
2251: Level: beginner
2253: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2254: `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2255: M*/
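/* A minimal command-line sketch of selecting this solver (illustrative only; assumes an existing PETSc
   program that configures its KSP/PC from the options database):

     ./app -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   Programmatically, the same effect can be obtained with PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE)
   after PCSetType(pc, PCILU). */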
2257: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2258: {
2259: PetscInt n = A->rmap->n;
2260: PetscBool factOnDevice, factOnHost;
2261: char *prefix;
2262: char factPlace[32] = "device"; /* the default */
2264: PetscFunctionBegin;
2265: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2266: PetscCall(MatSetSizes(*B, n, n, n, n));
2267: (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2268: PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2270: prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
2271: PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
2272: PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
2273: PetscOptionsEnd();
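  /* For example (illustrative), "-mat_factor_bind_factorization host" requests that the factorization be
     performed on the host rather than the device when possible; the default is "device". */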
2274: PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
2275: PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
2276: PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
2277: ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;
2279: if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2280: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2281: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2282: if (!A->boundtocpu) {
2283: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2284: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2285: } else {
2286: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2287: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
2288: }
2289: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2290: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2291: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2292: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2293: if (!A->boundtocpu) {
2294: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2295: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2296: } else {
2297: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
2298: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2299: }
2300: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2301: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2302: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2304: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2305: (*B)->canuseordering = PETSC_TRUE;
2306: PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2307: PetscFunctionReturn(PETSC_SUCCESS);
2308: }
2310: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2311: {
2312: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2313: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2314: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2315: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2316: #endif
2318: PetscFunctionBegin;
2319: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2320: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2321: if (A->factortype == MAT_FACTOR_NONE) {
2322: CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2323: PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2324: }
2325: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2326: else if (fs->csrVal) {
2327: /* We have a factorized matrix on device and are able to copy it to host */
2328: PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2329: }
2330: #endif
2331: else
2332: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2333: PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2334: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2335: A->offloadmask = PETSC_OFFLOAD_BOTH;
2336: }
2337: PetscFunctionReturn(PETSC_SUCCESS);
2338: }
2340: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2341: {
2342: PetscFunctionBegin;
2343: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2344: *array = ((Mat_SeqAIJ *)A->data)->a;
2345: PetscFunctionReturn(PETSC_SUCCESS);
2346: }
2348: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2349: {
2350: PetscFunctionBegin;
2351: A->offloadmask = PETSC_OFFLOAD_CPU;
2352: *array = NULL;
2353: PetscFunctionReturn(PETSC_SUCCESS);
2354: }
2356: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2357: {
2358: PetscFunctionBegin;
2359: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2360: *array = ((Mat_SeqAIJ *)A->data)->a;
2361: PetscFunctionReturn(PETSC_SUCCESS);
2362: }
2364: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2365: {
2366: PetscFunctionBegin;
2367: *array = NULL;
2368: PetscFunctionReturn(PETSC_SUCCESS);
2369: }
2371: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2372: {
2373: PetscFunctionBegin;
2374: *array = ((Mat_SeqAIJ *)A->data)->a;
2375: PetscFunctionReturn(PETSC_SUCCESS);
2376: }
2378: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2379: {
2380: PetscFunctionBegin;
2381: A->offloadmask = PETSC_OFFLOAD_CPU;
2382: *array = NULL;
2383: PetscFunctionReturn(PETSC_SUCCESS);
2384: }
2386: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2387: {
2388: Mat_SeqAIJCUSPARSE *cusp;
2389: CsrMatrix *matrix;
2391: PetscFunctionBegin;
2392: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2393: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2394: cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2395: PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2396: matrix = (CsrMatrix *)cusp->mat->mat;
2398: if (i) {
2399: #if !defined(PETSC_USE_64BIT_INDICES)
2400: *i = matrix->row_offsets->data().get();
2401: #else
2402: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2403: #endif
2404: }
2405: if (j) {
2406: #if !defined(PETSC_USE_64BIT_INDICES)
2407: *j = matrix->column_indices->data().get();
2408: #else
2409: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2410: #endif
2411: }
2412: if (a) *a = matrix->values->data().get();
2413: if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2414: PetscFunctionReturn(PETSC_SUCCESS);
2415: }
2417: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2418: {
2419: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2420: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
2421: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2422: PetscInt m = A->rmap->n, *ii, *ridx, tmp;
2423: cusparseStatus_t stat;
2424: PetscBool both = PETSC_TRUE;
2426: PetscFunctionBegin;
2427: PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2428: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2429: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2430: CsrMatrix *matrix;
2431: matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2433: PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2434: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2435: matrix->values->assign(a->a, a->a + a->nz);
2436: PetscCallCUDA(WaitForCUDA());
2437: PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
2438: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2439: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2440: } else {
2441: PetscInt nnz;
2442: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2443: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2444: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2445: delete cusparsestruct->workVector;
2446: delete cusparsestruct->rowoffsets_gpu;
2447: cusparsestruct->workVector = NULL;
2448: cusparsestruct->rowoffsets_gpu = NULL;
2449: try {
2450: if (a->compressedrow.use) {
2451: m = a->compressedrow.nrows;
2452: ii = a->compressedrow.i;
2453: ridx = a->compressedrow.rindex;
2454: } else {
2455: m = A->rmap->n;
2456: ii = a->i;
2457: ridx = NULL;
2458: }
2459: PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2460: if (!a->a) {
2461: nnz = ii[m];
2462: both = PETSC_FALSE;
2463: } else nnz = a->nz;
2464: PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2466: /* create cusparse matrix */
2467: cusparsestruct->nrows = m;
2468: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
2469: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2470: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2471: PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2473: PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
2474: PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
2475: PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
2476: PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2477: PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2478: PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2479: PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2481: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2482: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2483: /* set the matrix */
2484: CsrMatrix *mat = new CsrMatrix;
2485: mat->num_rows = m;
2486: mat->num_cols = A->cmap->n;
2487: mat->num_entries = nnz;
2488: mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2489: mat->row_offsets->assign(ii, ii + m + 1);
2491: mat->column_indices = new THRUSTINTARRAY32(nnz);
2492: mat->column_indices->assign(a->j, a->j + nnz);
2494: mat->values = new THRUSTARRAY(nnz);
2495: if (a->a) mat->values->assign(a->a, a->a + nnz);
2497: /* assign the pointer */
2498: matstruct->mat = mat;
2499: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2500: if (mat->num_rows) { /* cusparse errors on empty matrices! */
2501: stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2502: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2503: PetscCallCUSPARSE(stat);
2504: }
2505: #endif
2506: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2507: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2508: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2509: #else
2510: CsrMatrix *mat = new CsrMatrix;
2511: mat->num_rows = m;
2512: mat->num_cols = A->cmap->n;
2513: mat->num_entries = nnz;
2514: mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2515: mat->row_offsets->assign(ii, ii + m + 1);
2517: mat->column_indices = new THRUSTINTARRAY32(nnz);
2518: mat->column_indices->assign(a->j, a->j + nnz);
2520: mat->values = new THRUSTARRAY(nnz);
2521: if (a->a) mat->values->assign(a->a, a->a + nnz);
2523: cusparseHybMat_t hybMat;
2524: PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2525: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2526: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2527: PetscCallCUSPARSE(stat);
2528: /* assign the pointer */
2529: matstruct->mat = hybMat;
2531: if (mat) {
2532: if (mat->values) delete (THRUSTARRAY *)mat->values;
2533: if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2534: if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2535: delete (CsrMatrix *)mat;
2536: }
2537: #endif
2538: }
2540: /* assign the compressed row indices */
2541: if (a->compressedrow.use) {
2542: cusparsestruct->workVector = new THRUSTARRAY(m);
2543: matstruct->cprowIndices = new THRUSTINTARRAY(m);
2544: matstruct->cprowIndices->assign(ridx, ridx + m);
2545: tmp = m;
2546: } else {
2547: cusparsestruct->workVector = NULL;
2548: matstruct->cprowIndices = NULL;
2549: tmp = 0;
2550: }
2551: PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2553: /* assign the pointer */
2554: cusparsestruct->mat = matstruct;
2555: } catch (char *ex) {
2556: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2557: }
2558: PetscCallCUDA(WaitForCUDA());
2559: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2560: cusparsestruct->nonzerostate = A->nonzerostate;
2561: }
2562: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2563: }
2564: PetscFunctionReturn(PETSC_SUCCESS);
2565: }
2567: struct VecCUDAPlusEquals {
2568: template <typename Tuple>
2569: __host__ __device__ void operator()(Tuple t)
2570: {
2571: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2572: }
2573: };
2575: struct VecCUDAEquals {
2576: template <typename Tuple>
2577: __host__ __device__ void operator()(Tuple t)
2578: {
2579: thrust::get<1>(t) = thrust::get<0>(t);
2580: }
2581: };
2583: struct VecCUDAEqualsReverse {
2584: template <typename Tuple>
2585: __host__ __device__ void operator()(Tuple t)
2586: {
2587: thrust::get<0>(t) = thrust::get<1>(t);
2588: }
2589: };
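/* Illustrative sketch (not part of this file) of how functors such as VecCUDAPlusEquals are applied:
   they are passed to thrust::for_each over a zip iterator pairing two device ranges, e.g.

     // x, y: thrust::device_vector<PetscScalar> of equal length; computes y[i] += x[i]
     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(x.begin(), y.begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(x.end(), y.end())),
                      VecCUDAPlusEquals());
*/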
2591: struct MatMatCusparse {
2592: PetscBool cisdense;
2593: PetscScalar *Bt;
2594: Mat X;
2595: PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2596: PetscLogDouble flops;
2597: CsrMatrix *Bcsr;
2599: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2600: cusparseSpMatDescr_t matSpBDescr;
2601: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
2602: cusparseDnMatDescr_t matBDescr;
2603: cusparseDnMatDescr_t matCDescr;
2604: PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
2605: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2606: void *dBuffer4;
2607: void *dBuffer5;
2608: #endif
2609: size_t mmBufferSize;
2610: void *mmBuffer;
2611: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2612: cusparseSpGEMMDescr_t spgemmDesc;
2613: #endif
2614: };
2616: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2617: {
2618: MatMatCusparse *mmdata = (MatMatCusparse *)data;
2620: PetscFunctionBegin;
2621: PetscCallCUDA(cudaFree(mmdata->Bt));
2622: delete mmdata->Bcsr;
2623: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2624: if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2625: if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2626: if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2627: if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2628: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2629: if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2630: if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2631: #endif
2632: if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2633: if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2634: #endif
2635: PetscCall(MatDestroy(&mmdata->X));
2636: PetscCall(PetscFree(data));
2637: PetscFunctionReturn(PETSC_SUCCESS);
2638: }
2640: #include <../src/mat/impls/dense/seq/dense.h>
2642: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2643: {
2644: Mat_Product *product = C->product;
2645: Mat A, B;
2646: PetscInt m, n, blda, clda;
2647: PetscBool flg, biscuda;
2648: Mat_SeqAIJCUSPARSE *cusp;
2649: cusparseStatus_t stat;
2650: cusparseOperation_t opA;
2651: const PetscScalar *barray;
2652: PetscScalar *carray;
2653: MatMatCusparse *mmdata;
2654: Mat_SeqAIJCUSPARSEMultStruct *mat;
2655: CsrMatrix *csrmat;
2657: PetscFunctionBegin;
2658: MatCheckProduct(C, 1);
2659: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2660: mmdata = (MatMatCusparse *)product->data;
2661: A = product->A;
2662: B = product->B;
2663: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2664: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2665: /* Currently CopyToGpu does not copy if the matrix is bound to the CPU.
2666: Instead of silently accepting a wrong answer, we prefer to raise an error */
2667: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2668: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2669: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2670: switch (product->type) {
2671: case MATPRODUCT_AB:
2672: case MATPRODUCT_PtAP:
2673: mat = cusp->mat;
2674: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2675: m = A->rmap->n;
2676: n = B->cmap->n;
2677: break;
2678: case MATPRODUCT_AtB:
2679: if (!A->form_explicit_transpose) {
2680: mat = cusp->mat;
2681: opA = CUSPARSE_OPERATION_TRANSPOSE;
2682: } else {
2683: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2684: mat = cusp->matTranspose;
2685: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2686: }
2687: m = A->cmap->n;
2688: n = B->cmap->n;
2689: break;
2690: case MATPRODUCT_ABt:
2691: case MATPRODUCT_RARt:
2692: mat = cusp->mat;
2693: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2694: m = A->rmap->n;
2695: n = B->rmap->n;
2696: break;
2697: default:
2698: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2699: }
2700: PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2701: csrmat = (CsrMatrix *)mat->mat;
2702: /* if the user passed a CPU matrix, copy the data to the GPU */
2703: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2704: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2705: PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2707: PetscCall(MatDenseGetLDA(B, &blda));
2708: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2709: PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2710: PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2711: } else {
2712: PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2713: PetscCall(MatDenseGetLDA(C, &clda));
2714: }
2716: PetscCall(PetscLogGpuTimeBegin());
2717: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2718: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2719: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2720: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2721: size_t mmBufferSize;
2722: if (mmdata->initialized && mmdata->Blda != blda) {
2723: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2724: mmdata->matBDescr = NULL;
2725: }
2726: if (!mmdata->matBDescr) {
2727: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2728: mmdata->Blda = blda;
2729: }
2731: if (mmdata->initialized && mmdata->Clda != clda) {
2732: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2733: mmdata->matCDescr = NULL;
2734: }
2735: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2736: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2737: mmdata->Clda = clda;
2738: }
2740: if (!mat->matDescr) {
2741: stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2742: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2743: PetscCallCUSPARSE(stat);
2744: }
2745: stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
2746: PetscCallCUSPARSE(stat);
2747: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2748: PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2749: PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2750: mmdata->mmBufferSize = mmBufferSize;
2751: }
2752: mmdata->initialized = PETSC_TRUE;
2753: } else {
2754: /* to be safe, always update pointers of the mats */
2755: PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2756: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2757: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2758: }
2760: /* do cusparseSpMM, which supports transpose on B */
2761: stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
2762: PetscCallCUSPARSE(stat);
2763: #else
2764: PetscInt k;
2765: /* cusparseXcsrmm does not support transpose on B */
2766: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2767: cublasHandle_t cublasv2handle;
2768: cublasStatus_t cerr;
2770: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2771: cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2772: PetscCallCUBLAS(cerr);
2773: blda = B->cmap->n;
2774: k = B->cmap->n;
2775: } else {
2776: k = B->rmap->n;
2777: }
2779: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2780: stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2781: PetscCallCUSPARSE(stat);
2782: #endif
2783: PetscCall(PetscLogGpuTimeEnd());
2784: PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2785: PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2786: if (product->type == MATPRODUCT_RARt) {
2787: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2788: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2789: } else if (product->type == MATPRODUCT_PtAP) {
2790: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2791: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2792: } else {
2793: PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2794: }
2795: if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2796: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2797: PetscFunctionReturn(PETSC_SUCCESS);
2798: }
2800: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2801: {
2802: Mat_Product *product = C->product;
2803: Mat A, B;
2804: PetscInt m, n;
2805: PetscBool cisdense, flg;
2806: MatMatCusparse *mmdata;
2807: Mat_SeqAIJCUSPARSE *cusp;
2809: PetscFunctionBegin;
2810: MatCheckProduct(C, 1);
2811: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2812: A = product->A;
2813: B = product->B;
2814: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2815: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2816: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2817: PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2818: switch (product->type) {
2819: case MATPRODUCT_AB:
2820: m = A->rmap->n;
2821: n = B->cmap->n;
2822: break;
2823: case MATPRODUCT_AtB:
2824: m = A->cmap->n;
2825: n = B->cmap->n;
2826: break;
2827: case MATPRODUCT_ABt:
2828: m = A->rmap->n;
2829: n = B->rmap->n;
2830: break;
2831: case MATPRODUCT_PtAP:
2832: m = B->cmap->n;
2833: n = B->cmap->n;
2834: break;
2835: case MATPRODUCT_RARt:
2836: m = B->rmap->n;
2837: n = B->rmap->n;
2838: break;
2839: default:
2840: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2841: }
2842: PetscCall(MatSetSizes(C, m, n, m, n));
2843: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2844: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2845: PetscCall(MatSetType(C, MATSEQDENSECUDA));
2847: /* product data */
2848: PetscCall(PetscNew(&mmdata));
2849: mmdata->cisdense = cisdense;
2850: #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2851: /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2852: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2853: #endif
2854: /* for these products we need intermediate storage */
2855: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2856: PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2857: PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2858: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2859: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2860: } else {
2861: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2862: }
2863: }
2864: C->product->data = mmdata;
2865: C->product->destroy = MatDestroy_MatMatCusparse;
2867: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2868: PetscFunctionReturn(PETSC_SUCCESS);
2869: }
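/* A minimal sketch (illustrative only) of the user-level calls that reach the symbolic/numeric routines
   above, assuming A is MATSEQAIJCUSPARSE and B is a dense matrix:

     Mat C;
     PetscCall(MatProductCreate(A, B, NULL, &C));
     PetscCall(MatProductSetType(C, MATPRODUCT_AB));
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C));  // dispatches to MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA
     PetscCall(MatProductNumeric(C));   // dispatches to MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA

   MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C) wraps the same sequence. */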
2871: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2872: {
2873: Mat_Product *product = C->product;
2874: Mat A, B;
2875: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
2876: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
2877: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2878: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2879: PetscBool flg;
2880: cusparseStatus_t stat;
2881: MatProductType ptype;
2882: MatMatCusparse *mmdata;
2883: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2884: cusparseSpMatDescr_t BmatSpDescr;
2885: #endif
2886: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2888: PetscFunctionBegin;
2889: MatCheckProduct(C, 1);
2890: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2891: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2892: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2893: mmdata = (MatMatCusparse *)C->product->data;
2894: A = product->A;
2895: B = product->B;
2896: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2897: mmdata->reusesym = PETSC_FALSE;
2898: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2899: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2900: Cmat = Ccusp->mat;
2901: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2902: Ccsr = (CsrMatrix *)Cmat->mat;
2903: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2904: goto finalize;
2905: }
2906: if (!c->nz) goto finalize;
2907: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2908: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2909: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2910: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2911: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2912: PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2913: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2914: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2915: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2916: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2917: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2918: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2919: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2920: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2922: ptype = product->type;
2923: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2924: ptype = MATPRODUCT_AB;
2925: PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2926: }
2927: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2928: ptype = MATPRODUCT_AB;
2929: PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2930: }
2931: switch (ptype) {
2932: case MATPRODUCT_AB:
2933: Amat = Acusp->mat;
2934: Bmat = Bcusp->mat;
2935: break;
2936: case MATPRODUCT_AtB:
2937: Amat = Acusp->matTranspose;
2938: Bmat = Bcusp->mat;
2939: break;
2940: case MATPRODUCT_ABt:
2941: Amat = Acusp->mat;
2942: Bmat = Bcusp->matTranspose;
2943: break;
2944: default:
2945: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2946: }
2947: Cmat = Ccusp->mat;
2948: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2949: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2950: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2951: Acsr = (CsrMatrix *)Amat->mat;
2952: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2953: Ccsr = (CsrMatrix *)Cmat->mat;
2954: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2955: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2956: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2957: PetscCall(PetscLogGpuTimeBegin());
2958: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2959: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2960: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2961: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2962: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2963: PetscCallCUSPARSE(stat);
2964: #else
2965: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2966: PetscCallCUSPARSE(stat);
2967: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2968: PetscCallCUSPARSE(stat);
2969: #endif
2970: #else
2971: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2972: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2973: PetscCallCUSPARSE(stat);
2974: #endif
2975: PetscCall(PetscLogGpuFlops(mmdata->flops));
2976: PetscCallCUDA(WaitForCUDA());
2977: PetscCall(PetscLogGpuTimeEnd());
2978: C->offloadmask = PETSC_OFFLOAD_GPU;
2979: finalize:
2980: /* shorter version of MatAssemblyEnd_SeqAIJ */
2981: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
2982: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
2983: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2984: c->reallocs = 0;
2985: C->info.mallocs += 0;
2986: C->info.nz_unneeded = 0;
2987: C->assembled = C->was_assembled = PETSC_TRUE;
2988: C->num_ass++;
2989: PetscFunctionReturn(PETSC_SUCCESS);
2990: }
2992: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2993: {
2994: Mat_Product *product = C->product;
2995: Mat A, B;
2996: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
2997: Mat_SeqAIJ *a, *b, *c;
2998: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2999: CsrMatrix *Acsr, *Bcsr, *Ccsr;
3000: PetscInt i, j, m, n, k;
3001: PetscBool flg;
3002: cusparseStatus_t stat;
3003: MatProductType ptype;
3004: MatMatCusparse *mmdata;
3005: PetscLogDouble flops;
3006: PetscBool biscompressed, ciscompressed;
3007: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3008: int64_t C_num_rows1, C_num_cols1, C_nnz1;
3009: cusparseSpMatDescr_t BmatSpDescr;
3010: #else
3011: int cnz;
3012: #endif
3013: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3015: PetscFunctionBegin;
3016: MatCheckProduct(C, 1);
3017: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3018: A = product->A;
3019: B = product->B;
3020: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3021: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3022: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3023: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3024: a = (Mat_SeqAIJ *)A->data;
3025: b = (Mat_SeqAIJ *)B->data;
3026: /* product data */
3027: PetscCall(PetscNew(&mmdata));
3028: C->product->data = mmdata;
3029: C->product->destroy = MatDestroy_MatMatCusparse;
3031: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3032: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3033: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3034: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3035: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3036: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3038: ptype = product->type;
3039: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3040: ptype = MATPRODUCT_AB;
3041: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3042: }
3043: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3044: ptype = MATPRODUCT_AB;
3045: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3046: }
3047: biscompressed = PETSC_FALSE;
3048: ciscompressed = PETSC_FALSE;
3049: switch (ptype) {
3050: case MATPRODUCT_AB:
3051: m = A->rmap->n;
3052: n = B->cmap->n;
3053: k = A->cmap->n;
3054: Amat = Acusp->mat;
3055: Bmat = Bcusp->mat;
3056: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3057: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3058: break;
3059: case MATPRODUCT_AtB:
3060: m = A->cmap->n;
3061: n = B->cmap->n;
3062: k = A->rmap->n;
3063: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3064: Amat = Acusp->matTranspose;
3065: Bmat = Bcusp->mat;
3066: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3067: break;
3068: case MATPRODUCT_ABt:
3069: m = A->rmap->n;
3070: n = B->rmap->n;
3071: k = A->cmap->n;
3072: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3073: Amat = Acusp->mat;
3074: Bmat = Bcusp->matTranspose;
3075: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3076: break;
3077: default:
3078: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3079: }
3081: /* create cusparse matrix */
3082: PetscCall(MatSetSizes(C, m, n, m, n));
3083: PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3084: c = (Mat_SeqAIJ *)C->data;
3085: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3086: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
3087: Ccsr = new CsrMatrix;
3089: c->compressedrow.use = ciscompressed;
3090: if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format */
3091: c->compressedrow.nrows = a->compressedrow.nrows;
3092: PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3093: PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3094: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
3095: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3096: Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3097: } else {
3098: c->compressedrow.nrows = 0;
3099: c->compressedrow.i = NULL;
3100: c->compressedrow.rindex = NULL;
3101: Ccusp->workVector = NULL;
3102: Cmat->cprowIndices = NULL;
3103: }
3104: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
3105: Ccusp->mat = Cmat;
3106: Ccusp->mat->mat = Ccsr;
3107: Ccsr->num_rows = Ccusp->nrows;
3108: Ccsr->num_cols = n;
3109: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3110: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3111: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3112: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3113: PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
3114: PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
3115: PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
3116: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3117: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3118: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3119: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
3120: PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3121: c->nz = 0;
3122: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3123: Ccsr->values = new THRUSTARRAY(c->nz);
3124: goto finalizesym;
3125: }
3127: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3128: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3129: Acsr = (CsrMatrix *)Amat->mat;
3130: if (!biscompressed) {
3131: Bcsr = (CsrMatrix *)Bmat->mat;
3132: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3133: BmatSpDescr = Bmat->matDescr;
3134: #endif
3135: } else { /* we need to use row offsets for the full matrix */
3136: CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
3137: Bcsr = new CsrMatrix;
3138: Bcsr->num_rows = B->rmap->n;
3139: Bcsr->num_cols = cBcsr->num_cols;
3140: Bcsr->num_entries = cBcsr->num_entries;
3141: Bcsr->column_indices = cBcsr->column_indices;
3142: Bcsr->values = cBcsr->values;
3143: if (!Bcusp->rowoffsets_gpu) {
3144: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3145: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3146: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3147: }
3148: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3149: mmdata->Bcsr = Bcsr;
3150: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3151: if (Bcsr->num_rows && Bcsr->num_cols) {
3152: stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3153: PetscCallCUSPARSE(stat);
3154: }
3155: BmatSpDescr = mmdata->matSpBDescr;
3156: #endif
3157: }
3158: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3159: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3160: /* precompute flops count */
3161: if (ptype == MATPRODUCT_AB) {
3162: for (i = 0, flops = 0; i < A->rmap->n; i++) {
3163: const PetscInt st = a->i[i];
3164: const PetscInt en = a->i[i + 1];
3165: for (j = st; j < en; j++) {
3166: const PetscInt brow = a->j[j];
3167: flops += 2. * (b->i[brow + 1] - b->i[brow]);
3168: }
3169: }
3170: } else if (ptype == MATPRODUCT_AtB) {
3171: for (i = 0, flops = 0; i < A->rmap->n; i++) {
3172: const PetscInt anzi = a->i[i + 1] - a->i[i];
3173: const PetscInt bnzi = b->i[i + 1] - b->i[i];
3174: flops += (2. * anzi) * bnzi;
3175: }
3176: } else { /* TODO */
3177: flops = 0.;
3178: }
3180: mmdata->flops = flops;
3181: PetscCall(PetscLogGpuTimeBegin());
3183: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3184: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3185: // cuda-12.2 requires non-null csrRowOffsets
3186: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3187: PetscCallCUSPARSE(stat);
3188: PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3189: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3190: {
3191: /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3192: We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3193: */
3194: void *dBuffer1 = NULL;
3195: void *dBuffer2 = NULL;
3196: void *dBuffer3 = NULL;
3197: /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3198: size_t bufferSize1 = 0;
3199: size_t bufferSize2 = 0;
3200: size_t bufferSize3 = 0;
3201: size_t bufferSize4 = 0;
3202: size_t bufferSize5 = 0;
3204: /* query how many bytes of external memory are needed (bufferSize1) */
3205: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3206: PetscCallCUSPARSE(stat);
3207: PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3208: /* inspect the matrices A and B to understand the memory requirement for the next step */
3209: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3210: PetscCallCUSPARSE(stat);
3212: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3213: PetscCallCUSPARSE(stat);
3214: PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3215: PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3216: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3217: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3218: PetscCallCUSPARSE(stat);
3219: PetscCallCUDA(cudaFree(dBuffer1));
3220: PetscCallCUDA(cudaFree(dBuffer2));
3222: /* get matrix C non-zero entries C_nnz1 */
3223: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3224: c->nz = (PetscInt)C_nnz1;
3225: /* allocate matrix C */
3226: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3227: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3228: Ccsr->values = new THRUSTARRAY(c->nz);
3229: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3230: /* update matC with the new pointers */
3231: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3232: PetscCallCUSPARSE(stat);
3234: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3235: PetscCallCUSPARSE(stat);
3236: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3237: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3238: PetscCallCUSPARSE(stat);
3239: PetscCallCUDA(cudaFree(dBuffer3));
3240: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3241: PetscCallCUSPARSE(stat);
3242: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3243: }
3244: #else
3245: size_t bufSize2;
3246: /* query how many bytes of external memory are needed */
3247: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3248: PetscCallCUSPARSE(stat);
3249: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3250: /* inspect the matrices A and B to understand the memory requirement for the next step */
3251: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3252: PetscCallCUSPARSE(stat);
3253: /* query the buffer size needed by the compute step */
3254: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3255: PetscCallCUSPARSE(stat);
3256: /* Neither the CUSPARSE documentation nor the API is clear here.
3257:    We need both buffers to perform the operations properly!
3258:    mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
3259:    it only appears in the workEstimation calls, yet it seems to be needed in compute, so presumably its address
3260:    is stored in the descriptor! What a messy API... */
3261: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3262: /* compute the intermediate product of A * B */
3263: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3264: PetscCallCUSPARSE(stat);
3265: /* get matrix C non-zero entries C_nnz1 */
3266: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3267: c->nz = (PetscInt)C_nnz1;
3268: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3269: mmdata->mmBufferSize / 1024));
3270: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3271: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3272: Ccsr->values = new THRUSTARRAY(c->nz);
3273: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3274: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3275: PetscCallCUSPARSE(stat);
3276: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3277: PetscCallCUSPARSE(stat);
3278: #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3279: #else
3280: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3281: stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3282: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3283: PetscCallCUSPARSE(stat);
3284: c->nz = cnz;
3285: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3286: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3287: Ccsr->values = new THRUSTARRAY(c->nz);
3288: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3290: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3291: /* with the old gemm interface (removed in CUDA 11.0 and later) we cannot compute the symbolic factorization only.
3292:    I have tried the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when
3293:    D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
3294: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3295: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3296: PetscCallCUSPARSE(stat);
3297: #endif
3298: PetscCall(PetscLogGpuFlops(mmdata->flops));
3299: PetscCall(PetscLogGpuTimeEnd());
3300: finalizesym:
3301: c->singlemalloc = PETSC_FALSE;
3302: c->free_a = PETSC_TRUE;
3303: c->free_ij = PETSC_TRUE;
3304: PetscCall(PetscMalloc1(m + 1, &c->i));
3305: PetscCall(PetscMalloc1(c->nz, &c->j));
3306: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3307: PetscInt *d_i = c->i;
3308: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3309: THRUSTINTARRAY jj(Ccsr->column_indices->size());
3310: ii = *Ccsr->row_offsets;
3311: jj = *Ccsr->column_indices;
3312: if (ciscompressed) d_i = c->compressedrow.i;
3313: PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3314: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3315: } else {
3316: PetscInt *d_i = c->i;
3317: if (ciscompressed) d_i = c->compressedrow.i;
3318: PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3319: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3320: }
3321: if (ciscompressed) { /* need to expand host row offsets */
3322: PetscInt r = 0;
3323: c->i[0] = 0;
3324: for (k = 0; k < c->compressedrow.nrows; k++) {
3325: const PetscInt next = c->compressedrow.rindex[k];
3326: const PetscInt old = c->compressedrow.i[k];
3327: for (; r < next; r++) c->i[r + 1] = old;
3328: }
3329: for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3330: }
3331: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3332: PetscCall(PetscMalloc1(m, &c->ilen));
3333: PetscCall(PetscMalloc1(m, &c->imax));
3334: c->maxnz = c->nz;
3335: c->nonzerorowcnt = 0;
3336: c->rmax = 0;
3337: for (k = 0; k < m; k++) {
3338: const PetscInt nn = c->i[k + 1] - c->i[k];
3339: c->ilen[k] = c->imax[k] = nn;
3340: c->nonzerorowcnt += (PetscInt) !!nn;
3341: c->rmax = PetscMax(c->rmax, nn);
3342: }
3343: PetscCall(MatMarkDiagonal_SeqAIJ(C));
3344: PetscCall(PetscMalloc1(c->nz, &c->a));
3345: Ccsr->num_entries = c->nz;
3347: C->nonzerostate++;
3348: PetscCall(PetscLayoutSetUp(C->rmap));
3349: PetscCall(PetscLayoutSetUp(C->cmap));
3350: Ccusp->nonzerostate = C->nonzerostate;
3351: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3352: C->preallocated = PETSC_TRUE;
3353: C->assembled = PETSC_FALSE;
3354: C->was_assembled = PETSC_FALSE;
3355: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3356: mmdata->reusesym = PETSC_TRUE;
3357: C->offloadmask = PETSC_OFFLOAD_GPU;
3358: }
3359: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3360: PetscFunctionReturn(PETSC_SUCCESS);
3361: }
3363: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3365: /* handles sparse or dense B */
3366: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3367: {
3368: Mat_Product *product = mat->product;
3369: PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3371: PetscFunctionBegin;
3372: MatCheckProduct(mat, 1);
3373: PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3374: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3375: if (product->type == MATPRODUCT_ABC) {
3376: Ciscusp = PETSC_FALSE;
3377: if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3378: }
3379: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3380: PetscBool usecpu = PETSC_FALSE;
3381: switch (product->type) {
3382: case MATPRODUCT_AB:
3383: if (product->api_user) {
3384: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3385: PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3386: PetscOptionsEnd();
3387: } else {
3388: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3389: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3390: PetscOptionsEnd();
3391: }
3392: break;
3393: case MATPRODUCT_AtB:
3394: if (product->api_user) {
3395: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3396: PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3397: PetscOptionsEnd();
3398: } else {
3399: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3400: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3401: PetscOptionsEnd();
3402: }
3403: break;
3404: case MATPRODUCT_PtAP:
3405: if (product->api_user) {
3406: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3407: PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3408: PetscOptionsEnd();
3409: } else {
3410: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3411: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3412: PetscOptionsEnd();
3413: }
3414: break;
3415: case MATPRODUCT_RARt:
3416: if (product->api_user) {
3417: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3418: PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3419: PetscOptionsEnd();
3420: } else {
3421: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3422: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3423: PetscOptionsEnd();
3424: }
3425: break;
3426: case MATPRODUCT_ABC:
3427: if (product->api_user) {
3428: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3429: PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3430: PetscOptionsEnd();
3431: } else {
3432: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3433: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3434: PetscOptionsEnd();
3435: }
3436: break;
3437: default:
3438: break;
3439: }
3440: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3441: }
3442: /* dispatch */
3443: if (isdense) {
3444: switch (product->type) {
3445: case MATPRODUCT_AB:
3446: case MATPRODUCT_AtB:
3447: case MATPRODUCT_ABt:
3448: case MATPRODUCT_PtAP:
3449: case MATPRODUCT_RARt:
3450: if (product->A->boundtocpu) {
3451: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3452: } else {
3453: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3454: }
3455: break;
3456: case MATPRODUCT_ABC:
3457: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3458: break;
3459: default:
3460: break;
3461: }
3462: } else if (Biscusp && Ciscusp) {
3463: switch (product->type) {
3464: case MATPRODUCT_AB:
3465: case MATPRODUCT_AtB:
3466: case MATPRODUCT_ABt:
3467: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3468: break;
3469: case MATPRODUCT_PtAP:
3470: case MATPRODUCT_RARt:
3471: case MATPRODUCT_ABC:
3472: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3473: break;
3474: default:
3475: break;
3476: }
3477: } else { /* fallback for AIJ */
3478: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3479: }
3480: PetscFunctionReturn(PETSC_SUCCESS);
3481: }
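/*
   Example (not part of the original source): a minimal sketch of how the product dispatch above is
   typically exercised from user code, assuming A and B are already assembled MATSEQAIJCUSPARSE
   matrices. The -matmatmult_backend_cpu option parsed above can then be used to fall back to the CPU path.

     Mat C;
     PetscCall(MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C)); // symbolic + numeric product on the GPU
     PetscCall(MatMatMult(A, B, MAT_REUSE_MATRIX, PETSC_DEFAULT, &C));   // reuse the symbolic data for updated values
     PetscCall(MatDestroy(&C));
*/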
3483: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3484: {
3485: PetscFunctionBegin;
3486: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3487: PetscFunctionReturn(PETSC_SUCCESS);
3488: }
3490: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3491: {
3492: PetscFunctionBegin;
3493: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3494: PetscFunctionReturn(PETSC_SUCCESS);
3495: }
3497: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3498: {
3499: PetscFunctionBegin;
3500: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3501: PetscFunctionReturn(PETSC_SUCCESS);
3502: }
3504: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3505: {
3506: PetscFunctionBegin;
3507: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3508: PetscFunctionReturn(PETSC_SUCCESS);
3509: }
3511: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3512: {
3513: PetscFunctionBegin;
3514: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3515: PetscFunctionReturn(PETSC_SUCCESS);
3516: }
3518: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3519: {
3520: int i = blockIdx.x * blockDim.x + threadIdx.x;
3521: if (i < n) y[idx[i]] += x[i];
3522: }
3524: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3525: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3526: {
3527: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3528: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3529: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3530: PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
3531: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3532: PetscBool compressed;
3533: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3534: PetscInt nx, ny;
3535: #endif
3537: PetscFunctionBegin;
3538: PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian without transpose is not supported");
3539: if (!a->nz) {
3540: if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3541: else PetscCall(VecSeq_CUDA::Set(zz, 0));
3542: PetscFunctionReturn(PETSC_SUCCESS);
3543: }
3544: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3545: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3546: if (!trans) {
3547: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3548: PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3549: } else {
3550: if (herm || !A->form_explicit_transpose) {
3551: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3552: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3553: } else {
3554: if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3555: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3556: }
3557: }
3558: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3559: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3561: try {
3562: PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3563: if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3564: else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */
3566: PetscCall(PetscLogGpuTimeBegin());
3567: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3568: /* z = A x + beta y.
3569: If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3570: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3571: */
3572: xptr = xarray;
3573: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3574: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3575: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3576: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3577: allocated to accommodate different uses. So we get the length info directly from mat.
3578: */
3579: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3580: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3581: nx = mat->num_cols;
3582: ny = mat->num_rows;
3583: }
3584: #endif
3585: } else {
3586: /* z = A^T x + beta y
3587: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3588: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3589: */
3590: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3591: dptr = zarray;
3592: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3593: if (compressed) { /* Scatter x to work vector */
3594: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3596: thrust::for_each(
3597: #if PetscDefined(HAVE_THRUST_ASYNC)
3598: thrust::cuda::par.on(PetscDefaultCudaStream),
3599: #endif
3600: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3601: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3602: }
3603: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3604: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3605: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3606: nx = mat->num_rows;
3607: ny = mat->num_cols;
3608: }
3609: #endif
3610: }
3612: /* csr_spmv does y = alpha op(A) x + beta y */
3613: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3614: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3615: PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3616: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3617: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3618: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3619: PetscCallCUSPARSE(
3620: cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3621: PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3623: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3624: } else {
3625: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3626: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3627: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3628: }
3630: PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3631: matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3632: #else
3633: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3634: PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3635: #endif
3636: } else {
3637: if (cusparsestruct->nrows) {
3638: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3639: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3640: #else
3641: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3642: PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3643: #endif
3644: }
3645: }
3646: PetscCall(PetscLogGpuTimeEnd());
3648: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3649: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3650: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3651: PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */
3652: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3653: PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3654: }
3655: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3656: PetscCall(VecSeq_CUDA::Set(zz, 0));
3657: }
3659: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3660: if (compressed) {
3661: PetscCall(PetscLogGpuTimeBegin());
3662: /* I wanted to make this for_each asynchronous, but failed. thrust::async::for_each() returns an event (internally registered),
3663:    and in the destructor at the end of the scope it calls cudaStreamSynchronize() on this stream. One would have to store all the events to
3664:    prevent that. So I just added a ScatterAdd kernel instead.
3665: */
3666: #if 0
3667: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3668: thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3669: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3670: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3671: VecCUDAPlusEquals());
3672: #else
3673: PetscInt n = matstruct->cprowIndices->size();
3674: ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3675: #endif
3676: PetscCall(PetscLogGpuTimeEnd());
3677: }
3678: } else {
3679: if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3680: }
3681: PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3682: if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3683: else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3684: } catch (char *ex) {
3685: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3686: }
3687: if (yy) {
3688: PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3689: } else {
3690: PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3691: }
3692: PetscFunctionReturn(PETSC_SUCCESS);
3693: }
3695: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3696: {
3697: PetscFunctionBegin;
3698: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3699: PetscFunctionReturn(PETSC_SUCCESS);
3700: }
3702: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3703: {
3704: PetscFunctionBegin;
3705: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3706: PetscFunctionReturn(PETSC_SUCCESS);
3707: }
3709: /*@
3710: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3711: (the default parallel PETSc format).
3713: Collective
3715: Input Parameters:
3716: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3717: . m - number of rows
3718: . n - number of columns
3719: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3720: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3722: Output Parameter:
3723: . A - the matrix
3725: Level: intermediate
3727: Notes:
3728: This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
3729: calculations. For good matrix assembly performance the user should preallocate the matrix
3730: storage by setting the parameter `nz` (or the array `nnz`).
3732: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3733: MatXXXXSetPreallocation() paradigm instead of this routine directly.
3734: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3736: The AIJ format, also called
3737: compressed row storage, is fully compatible with standard Fortran
3738: storage. That is, the stored row and column indices can begin at
3739: either one (as in Fortran) or zero.
3741: Specify the preallocated storage with either nz or nnz (not both).
3742: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3743: allocation.
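   Example Usage:
   A minimal sketch (error checking and the insertion of entries are abbreviated; the vectors `x` and `y` are assumed to have compatible sizes):
.vb
   Mat A;
   MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, m, n, 5, NULL, &A); /* preallocate 5 nonzeros per row */
   /* ... MatSetValues(A, ...) ... */
   MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
   MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
   MatMult(A, x, y); /* performed on the GPU with CUSPARSE */
   MatDestroy(&A);
.ve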
3745: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
3746: @*/
3747: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3748: {
3749: PetscFunctionBegin;
3750: PetscCall(MatCreate(comm, A));
3751: PetscCall(MatSetSizes(*A, m, n, m, n));
3752: PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3753: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3754: PetscFunctionReturn(PETSC_SUCCESS);
3755: }
3757: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3758: {
3759: PetscFunctionBegin;
3760: if (A->factortype == MAT_FACTOR_NONE) {
3761: PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
3762: } else {
3763: PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3764: }
3765: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3766: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
3767: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
3768: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3769: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3770: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3771: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3772: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3773: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3774: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
3775: PetscCall(MatDestroy_SeqAIJ(A));
3776: PetscFunctionReturn(PETSC_SUCCESS);
3777: }
3779: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3780: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3781: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3782: {
3783: PetscFunctionBegin;
3784: PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3785: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
3786: PetscFunctionReturn(PETSC_SUCCESS);
3787: }
3789: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3790: {
3791: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3792: Mat_SeqAIJCUSPARSE *cy;
3793: Mat_SeqAIJCUSPARSE *cx;
3794: PetscScalar *ay;
3795: const PetscScalar *ax;
3796: CsrMatrix *csry, *csrx;
3798: PetscFunctionBegin;
3799: cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3800: cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3801: if (X->ops->axpy != Y->ops->axpy) {
3802: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3803: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3804: PetscFunctionReturn(PETSC_SUCCESS);
3805: }
3806: /* if we are here, it means both matrices are bound to GPU */
3807: PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3808: PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3809: PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3810: PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3811: csry = (CsrMatrix *)cy->mat->mat;
3812: csrx = (CsrMatrix *)cx->mat->mat;
3813: /* see if we can turn this into a cublas axpy */
3814: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3815: bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3816: if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3817: if (eq) str = SAME_NONZERO_PATTERN;
3818: }
3819: /* spgeam is buggy with one column */
3820: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3822: if (str == SUBSET_NONZERO_PATTERN) {
3823: PetscScalar b = 1.0;
3824: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3825: size_t bufferSize;
3826: void *buffer;
3827: #endif
3829: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3830: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3831: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3832: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3833: PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3834: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3835: PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
3836: PetscCall(PetscLogGpuTimeBegin());
3837: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3838: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3839: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3840: PetscCall(PetscLogGpuTimeEnd());
3841: PetscCallCUDA(cudaFree(buffer));
3842: #else
3843: PetscCall(PetscLogGpuTimeBegin());
3844: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3845: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3846: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3847: PetscCall(PetscLogGpuTimeEnd());
3848: #endif
3849: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
3850: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3851: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3852: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3853: } else if (str == SAME_NONZERO_PATTERN) {
3854: cublasHandle_t cublasv2handle;
3855: PetscBLASInt one = 1, bnz = 1;
3857: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3858: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3859: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3860: PetscCall(PetscBLASIntCast(x->nz, &bnz));
3861: PetscCall(PetscLogGpuTimeBegin());
3862: PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
3863: PetscCall(PetscLogGpuFlops(2.0 * bnz));
3864: PetscCall(PetscLogGpuTimeEnd());
3865: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3866: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3867: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3868: } else {
3869: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3870: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3871: }
3872: PetscFunctionReturn(PETSC_SUCCESS);
3873: }
3875: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3876: {
3877: Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
3878: PetscScalar *ay;
3879: cublasHandle_t cublasv2handle;
3880: PetscBLASInt one = 1, bnz = 1;
3882: PetscFunctionBegin;
3883: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3884: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3885: PetscCall(PetscBLASIntCast(y->nz, &bnz));
3886: PetscCall(PetscLogGpuTimeBegin());
3887: PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3888: PetscCall(PetscLogGpuFlops(bnz));
3889: PetscCall(PetscLogGpuTimeEnd());
3890: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3891: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3892: PetscFunctionReturn(PETSC_SUCCESS);
3893: }
3895: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3896: {
3897: PetscBool both = PETSC_FALSE;
3898: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3900: PetscFunctionBegin;
3901: if (A->factortype == MAT_FACTOR_NONE) {
3902: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3903: if (spptr->mat) {
3904: CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3905: if (matrix->values) {
3906: both = PETSC_TRUE;
3907: thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3908: }
3909: }
3910: if (spptr->matTranspose) {
3911: CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3912: if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3913: }
3914: }
3915: PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3916: PetscCall(MatSeqAIJInvalidateDiagonal(A));
3917: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3918: else A->offloadmask = PETSC_OFFLOAD_CPU;
3919: PetscFunctionReturn(PETSC_SUCCESS);
3920: }
3922: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3923: {
3924: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3926: PetscFunctionBegin;
3927: if (A->factortype != MAT_FACTOR_NONE) {
3928: A->boundtocpu = flg;
3929: PetscFunctionReturn(PETSC_SUCCESS);
3930: }
3931: if (flg) {
3932: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
3934: A->ops->scale = MatScale_SeqAIJ;
3935: A->ops->axpy = MatAXPY_SeqAIJ;
3936: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
3937: A->ops->mult = MatMult_SeqAIJ;
3938: A->ops->multadd = MatMultAdd_SeqAIJ;
3939: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
3940: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
3941: A->ops->multhermitiantranspose = NULL;
3942: A->ops->multhermitiantransposeadd = NULL;
3943: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
3944: PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
3945: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3946: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3947: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3948: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3949: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3950: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3951: } else {
3952: A->ops->scale = MatScale_SeqAIJCUSPARSE;
3953: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
3954: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
3955: A->ops->mult = MatMult_SeqAIJCUSPARSE;
3956: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
3957: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
3958: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
3959: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3960: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3961: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
3962: a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE;
3963: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
3964: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
3965: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
3966: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
3967: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
3968: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
3970: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
3971: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3972: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3973: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
3974: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
3975: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3976: }
3977: A->boundtocpu = flg;
3978: if (flg && a->inode.size) {
3979: a->inode.use = PETSC_TRUE;
3980: } else {
3981: a->inode.use = PETSC_FALSE;
3982: }
3983: PetscFunctionReturn(PETSC_SUCCESS);
3984: }
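/*
   Example (not part of the original source): a minimal sketch of toggling the CPU/GPU dispatch set up
   above from user code, assuming A is an assembled MATSEQAIJCUSPARSE matrix.

     PetscCall(MatBindToCPU(A, PETSC_TRUE));  // subsequent MatMult(), MatAXPY(), etc. use the CPU kernels
     PetscCall(MatBindToCPU(A, PETSC_FALSE)); // switch back to the CUSPARSE implementations
*/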
3986: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
3987: {
3988: Mat B;
3990: PetscFunctionBegin;
3991: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
3992: if (reuse == MAT_INITIAL_MATRIX) {
3993: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
3994: } else if (reuse == MAT_REUSE_MATRIX) {
3995: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
3996: }
3997: B = *newmat;
3999: PetscCall(PetscFree(B->defaultvectype));
4000: PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
4002: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
4003: if (B->factortype == MAT_FACTOR_NONE) {
4004: Mat_SeqAIJCUSPARSE *spptr;
4005: PetscCall(PetscNew(&spptr));
4006: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4007: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4008: spptr->format = MAT_CUSPARSE_CSR;
4009: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4010: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4011: spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4012: #else
4013: spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4014: #endif
4015: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4016: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4017: #endif
4018: B->spptr = spptr;
4019: } else {
4020: Mat_SeqAIJCUSPARSETriFactors *spptr;
4022: PetscCall(PetscNew(&spptr));
4023: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4024: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4025: B->spptr = spptr;
4026: }
4027: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4028: }
4029: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
4030: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
4031: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
4032: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
4033: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
4034: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
4036: PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
4037: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
4038: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4039: #if defined(PETSC_HAVE_HYPRE)
4040: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4041: #endif
4042: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
4043: PetscFunctionReturn(PETSC_SUCCESS);
4044: }
4046: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4047: {
4048: PetscFunctionBegin;
4049: PetscCall(MatCreate_SeqAIJ(B));
4050: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
4051: PetscFunctionReturn(PETSC_SUCCESS);
4052: }
4054: /*MC
4055: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4057: A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
4058: CSR, ELL, or Hybrid format.
4059: All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
4061: Options Database Keys:
4062: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4063: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4064: Other options include ell (ellpack) or hyb (hybrid).
4065: . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4066: - -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4068: Level: beginner
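   Note:
   A typical invocation (a sketch; the executable name is hypothetical) selects the type and storage format on the command line, for example
.vb
   ./myapp -mat_type aijcusparse -mat_cusparse_storage_format csr
.ve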
4070: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4071: M*/
4073: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4074: {
4075: PetscFunctionBegin;
4076: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
4077: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
4078: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
4079: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
4081: PetscFunctionReturn(PETSC_SUCCESS);
4082: }
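/*
   Example (not part of the original source): a minimal sketch of requesting one of the factorizations
   registered above, assuming A is an assembled MATSEQAIJCUSPARSE matrix. From the command line the
   equivalent for a preconditioner is -pc_type ilu -pc_factor_mat_solver_type cusparse.

     Mat           F;
     IS            rowperm, colperm;
     MatFactorInfo info;
     PetscCall(MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_LU, &F));
     PetscCall(MatGetOrdering(A, MATORDERINGND, &rowperm, &colperm));
     PetscCall(MatFactorInfoInitialize(&info));
     PetscCall(MatLUFactorSymbolic(F, A, rowperm, colperm, &info));
     PetscCall(MatLUFactorNumeric(F, A, &info));
*/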
4084: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
4085: {
4086: Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4088: PetscFunctionBegin;
4089: if (cusp) {
4090: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
4091: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4092: delete cusp->workVector;
4093: delete cusp->rowoffsets_gpu;
4094: delete cusp->csr2csc_i;
4095: delete cusp->coords;
4096: if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
4097: PetscCall(PetscFree(mat->spptr));
4098: }
4099: PetscFunctionReturn(PETSC_SUCCESS);
4100: }
4102: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4103: {
4104: PetscFunctionBegin;
4105: if (*mat) {
4106: delete (*mat)->values;
4107: delete (*mat)->column_indices;
4108: delete (*mat)->row_offsets;
4109: delete *mat;
4110: *mat = 0;
4111: }
4112: PetscFunctionReturn(PETSC_SUCCESS);
4113: }
4115: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4116: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4117: {
4118: PetscFunctionBegin;
4119: if (*trifactor) {
4120: if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4121: if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
4122: PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
4123: if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
4124: if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4125: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4126: if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4127: #endif
4128: PetscCall(PetscFree(*trifactor));
4129: }
4130: PetscFunctionReturn(PETSC_SUCCESS);
4131: }
4132: #endif
4134: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4135: {
4136: CsrMatrix *mat;
4138: PetscFunctionBegin;
4139: if (*matstruct) {
4140: if ((*matstruct)->mat) {
4141: if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4142: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4143: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4144: #else
4145: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
4146: PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4147: #endif
4148: } else {
4149: mat = (CsrMatrix *)(*matstruct)->mat;
4150: PetscCall(CsrMatrix_Destroy(&mat));
4151: }
4152: }
4153: if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
4154: delete (*matstruct)->cprowIndices;
4155: if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
4156: if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
4157: if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
4159: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4160: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
4161: if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
4162: for (int i = 0; i < 3; i++) {
4163: if (mdata->cuSpMV[i].initialized) {
4164: PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
4165: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
4166: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4167: }
4168: }
4169: #endif
4170: delete *matstruct;
4171: *matstruct = NULL;
4172: }
4173: PetscFunctionReturn(PETSC_SUCCESS);
4174: }
4176: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4177: {
4178: Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
4180: PetscFunctionBegin;
4181: if (fs) {
4182: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4183: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4184: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4185: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4186: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4187: delete fs->workVector;
4188: fs->workVector = NULL;
4189: #endif
4190: delete fs->rpermIndices;
4191: delete fs->cpermIndices;
4192: fs->rpermIndices = NULL;
4193: fs->cpermIndices = NULL;
4194: fs->init_dev_prop = PETSC_FALSE;
4195: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4196: PetscCallCUDA(cudaFree(fs->csrRowPtr));
4197: PetscCallCUDA(cudaFree(fs->csrColIdx));
4198: PetscCallCUDA(cudaFree(fs->csrRowPtr32));
4199: PetscCallCUDA(cudaFree(fs->csrColIdx32));
4200: PetscCallCUDA(cudaFree(fs->csrVal));
4201: PetscCallCUDA(cudaFree(fs->diag));
4202: PetscCallCUDA(cudaFree(fs->X));
4203: PetscCallCUDA(cudaFree(fs->Y));
4204: // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M is shared with one of spsvBuffer_L/U */
4205: PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4206: PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
4207: PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4208: PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4209: PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4210: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4211: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4212: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4213: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4214: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4215: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4216: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4217: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4218: PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4219: PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4220: PetscCall(PetscFree(fs->csrRowPtr_h));
4221: PetscCall(PetscFree(fs->csrVal_h));
4222: PetscCall(PetscFree(fs->diag_h));
4223: fs->createdTransposeSpSVDescr = PETSC_FALSE;
4224: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4225: #endif
4226: }
4227: PetscFunctionReturn(PETSC_SUCCESS);
4228: }
4230: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4231: {
4232: PetscFunctionBegin;
4233: if (*trifactors) {
4234: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4235: PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
4236: PetscCall(PetscFree(*trifactors));
4237: }
4238: PetscFunctionReturn(PETSC_SUCCESS);
4239: }
4241: struct IJCompare {
4242: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4243: {
4244: if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4245: if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4246: return false;
4247: }
4248: };
4250: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4251: {
4252: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4254: PetscFunctionBegin;
4255: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4256: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4257: if (destroy) {
4258: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4259: delete cusp->csr2csc_i;
4260: cusp->csr2csc_i = NULL;
4261: }
4262: A->transupdated = PETSC_FALSE;
4263: PetscFunctionReturn(PETSC_SUCCESS);
4264: }
4266: static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void *data)
4267: {
4268: MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)data;
4269: PetscFunctionBegin;
4270: PetscCallCUDA(cudaFree(coo->perm));
4271: PetscCallCUDA(cudaFree(coo->jmap));
4272: PetscCall(PetscFree(coo));
4273: PetscFunctionReturn(PETSC_SUCCESS);
4274: }
4276: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4277: {
4278: PetscBool dev_ij = PETSC_FALSE;
4279: PetscMemType mtype = PETSC_MEMTYPE_HOST;
4280: PetscInt *i, *j;
4281: PetscContainer container_h, container_d;
4282: MatCOOStruct_SeqAIJ *coo_h, *coo_d;
4284: PetscFunctionBegin;
4285: // The two MatResetPreallocationCOO_* must be done in order. The former relies on values that might be destroyed by the latter
4286: PetscCall(PetscGetMemType(coo_i, &mtype));
4287: if (PetscMemTypeDevice(mtype)) {
4288: dev_ij = PETSC_TRUE;
4289: PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
4290: PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4291: PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4292: } else {
4293: i = coo_i;
4294: j = coo_j;
4295: }
4297: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
4298: if (dev_ij) PetscCall(PetscFree2(i, j));
4299: mat->offloadmask = PETSC_OFFLOAD_CPU;
4300: // Create the GPU memory
4301: PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4303: // Copy the COO struct to device
4304: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
4305: PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
4306: PetscCall(PetscMalloc1(1, &coo_d));
4307: *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
4308: PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
4309: PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
4310: PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
4311: PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
4313: // Put the COO struct in a container and then attach that to the matrix
4314: PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
4315: PetscCall(PetscContainerSetPointer(container_d, coo_d));
4316: PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
4317: PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
4318: PetscCall(PetscContainerDestroy(&container_d));
4319: PetscFunctionReturn(PETSC_SUCCESS);
4320: }
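/* The grid-stride kernel below finalizes COO assembly on the device: for CSR slot i, the entries
   jmap[i] .. jmap[i+1]-1 of perm[] index the (possibly repeated) user-provided COO values kv[] that
   map to that slot; their sum either overwrites a[i] (INSERT_VALUES) or is added to it (ADD_VALUES). */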
4322: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4323: {
4324: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
4325: const PetscCount grid_size = gridDim.x * blockDim.x;
4326: for (; i < nnz; i += grid_size) {
4327: PetscScalar sum = 0.0;
4328: for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4329: a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4330: }
4331: }
4333: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4334: {
4335: Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data;
4336: Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;
4337: PetscCount Annz = seq->nz;
4338: PetscMemType memtype;
4339: const PetscScalar *v1 = v;
4340: PetscScalar *Aa;
4341: PetscContainer container;
4342: MatCOOStruct_SeqAIJ *coo;
4344: PetscFunctionBegin;
4345: if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4347: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
4348: PetscCall(PetscContainerGetPointer(container, (void **)&coo));
4350: PetscCall(PetscGetMemType(v, &memtype));
4351: if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on the host, copy it to the device */
4352: PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
4353: PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4354: }
4356: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
4357: else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));
4359: PetscCall(PetscLogGpuTimeBegin());
4360: if (Annz) {
4361: MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
4362: PetscCallCUDA(cudaPeekAtLastError());
4363: }
4364: PetscCall(PetscLogGpuTimeEnd());
4366: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
4367: else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
4369: if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
4370: PetscFunctionReturn(PETSC_SUCCESS);
4371: }
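/* A minimal sketch of how the two COO callbacks above are exercised through the public API,
   assuming a MATSEQAIJCUSPARSE matrix A and illustrative coordinate/value arrays:

     PetscInt    coo_i[] = {0, 0, 1};       // row indices (repeated (i,j) pairs are allowed)
     PetscInt    coo_j[] = {0, 0, 1};       // column indices
     PetscScalar v[]     = {1.0, 2.0, 3.0}; // values; repeated (i,j) pairs are summed

     PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j)); // builds the host and device COO structs
     PetscCall(MatSetValuesCOO(A, v, ADD_VALUES));          // launches MatAddCOOValues on the GPU
*/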
4373: /*@C
4374: MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4376: Not Collective
4378: Input Parameters:
4379: + A - the matrix
4380: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4382: Output Parameters:
4383: + i - the CSR row pointers
4384: - j - the CSR column indices
4386: Level: developer
4388: Note:
4389: When compressed is true, the CSR structure does not contain empty rows
4391: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4392: @*/
4393: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4394: {
4395: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4396: CsrMatrix *csr;
4397: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4399: PetscFunctionBegin;
4401: if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
4402: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4403: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4404: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4405: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4406: csr = (CsrMatrix *)cusp->mat->mat;
4407: if (i) {
4408: if (!compressed && a->compressedrow.use) { /* need full row offset */
4409: if (!cusp->rowoffsets_gpu) {
4410: cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4411: cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4412: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4413: }
4414: *i = cusp->rowoffsets_gpu->data().get();
4415: } else *i = csr->row_offsets->data().get();
4416: }
4417: if (j) *j = csr->column_indices->data().get();
4418: PetscFunctionReturn(PETSC_SUCCESS);
4419: }
4421: /*@C
4422: MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4424: Not Collective
4426: Input Parameters:
4427: + A - the matrix
4428: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4429: . i - the CSR row pointers
4430: - j - the CSR column indices
4432: Level: developer
4434: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4435: @*/
4436: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4437: {
4438: PetscFunctionBegin;
4440: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4441: if (i) *i = NULL;
4442: if (j) *j = NULL;
4443: (void)compressed;
4444: PetscFunctionReturn(PETSC_SUCCESS);
4445: }
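/* A minimal sketch of using the accessors above to inspect the device CSR pattern of a
   MATSEQAIJCUSPARSE matrix A (the kernel or routine consuming i/j is assumed, not shown):

     const int *i, *j;
     PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &i, &j)); // device pointers; i has A->rmap->n + 1 entries
     // ... read i[]/j[] from a kernel or pass them to a cuSPARSE routine ...
     PetscCall(MatSeqAIJCUSPARSERestoreIJ(A, PETSC_FALSE, &i, &j));
*/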
4447: /*@C
4448: MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4450: Not Collective
4452: Input Parameter:
4453: . A - a `MATSEQAIJCUSPARSE` matrix
4455: Output Parameter:
4456: . a - pointer to the device data
4458: Level: developer
4460: Note:
4461: May trigger a host-to-device copy if the up-to-date matrix data is only on the host
4463: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4464: @*/
4465: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4466: {
4467: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4468: CsrMatrix *csr;
4470: PetscFunctionBegin;
4472: PetscAssertPointer(a, 2);
4473: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4474: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4475: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4476: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4477: csr = (CsrMatrix *)cusp->mat->mat;
4478: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4479: *a = csr->values->data().get();
4480: PetscFunctionReturn(PETSC_SUCCESS);
4481: }
4483: /*@C
4484: MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4486: Not Collective
4488: Input Parameters:
4489: + A - a `MATSEQAIJCUSPARSE` matrix
4490: - a - pointer to the device data
4492: Level: developer
4494: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4495: @*/
4496: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4497: {
4498: PetscFunctionBegin;
4500: PetscAssertPointer(a, 2);
4501: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4502: *a = NULL;
4503: PetscFunctionReturn(PETSC_SUCCESS);
4504: }
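/* A minimal sketch of read-only access to the device values of a MATSEQAIJCUSPARSE matrix A,
   copying them to an illustrative host buffer:

     const PetscScalar *a;
     PetscScalar       *h;
     PetscInt           nz = ((Mat_SeqAIJ *)A->data)->nz; // number of stored nonzeros
     PetscCall(PetscMalloc1(nz, &h));
     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &a));
     PetscCallCUDA(cudaMemcpy(h, a, nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &a));
     PetscCall(PetscFree(h));
*/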
4506: /*@C
4507: MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4509: Not Collective
4511: Input Parameter:
4512: . A - a `MATSEQAIJCUSPARSE` matrix
4514: Output Parameter:
4515: . a - pointer to the device data
4517: Level: developer
4519: Note:
4520: May trigger a host-to-device copy if the up-to-date matrix data is only on the host
4522: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4523: @*/
4524: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4525: {
4526: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4527: CsrMatrix *csr;
4529: PetscFunctionBegin;
4531: PetscAssertPointer(a, 2);
4532: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4533: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4534: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4535: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4536: csr = (CsrMatrix *)cusp->mat->mat;
4537: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4538: *a = csr->values->data().get();
4539: A->offloadmask = PETSC_OFFLOAD_GPU;
4540: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4541: PetscFunctionReturn(PETSC_SUCCESS);
4542: }
4543: /*@C
4544: MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4546: Not Collective
4548: Input Parameters:
4549: + A - a `MATSEQAIJCUSPARSE` matrix
4550: - a - pointer to the device data
4552: Level: developer
4554: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4555: @*/
4556: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4557: {
4558: PetscFunctionBegin;
4560: PetscAssertPointer(a, 2);
4561: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4562: PetscCall(MatSeqAIJInvalidateDiagonal(A));
4563: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4564: *a = NULL;
4565: PetscFunctionReturn(PETSC_SUCCESS);
4566: }
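/* A minimal sketch of read-write access, here scaling all stored values in place with Thrust
   (the scaling constant is illustrative; requires <thrust/transform.h> and <thrust/functional.h>):

     PetscScalar *a;
     PetscInt     nz = ((Mat_SeqAIJ *)A->data)->nz; // number of stored nonzeros
     PetscCall(MatSeqAIJCUSPARSEGetArray(A, &a));
     thrust::device_ptr<PetscScalar> da = thrust::device_pointer_cast(a);
     thrust::transform(da, da + nz, thrust::make_constant_iterator((PetscScalar)2.0), da, thrust::multiplies<PetscScalar>());
     PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &a)); // increases the object state and invalidates the cached diagonal
*/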
4568: /*@C
4569: MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4571: Not Collective
4573: Input Parameter:
4574: . A - a `MATSEQAIJCUSPARSE` matrix
4576: Output Parameter:
4577: . a - pointer to the device data
4579: Level: developer
4581: Note:
4582: Does not trigger host-to-device copies; the data on the GPU is flagged as valid
4584: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4585: @*/
4586: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4587: {
4588: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4589: CsrMatrix *csr;
4591: PetscFunctionBegin;
4593: PetscAssertPointer(a, 2);
4594: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4595: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4596: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4597: csr = (CsrMatrix *)cusp->mat->mat;
4598: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4599: *a = csr->values->data().get();
4600: A->offloadmask = PETSC_OFFLOAD_GPU;
4601: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4602: PetscFunctionReturn(PETSC_SUCCESS);
4603: }
4605: /*@C
4606: MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4608: Not Collective
4610: Input Parameters:
4611: + A - a `MATSEQAIJCUSPARSE` matrix
4612: - a - pointer to the device data
4614: Level: developer
4616: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4617: @*/
4618: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4619: {
4620: PetscFunctionBegin;
4622: PetscAssertPointer(a, 2);
4623: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4624: PetscCall(MatSeqAIJInvalidateDiagonal(A));
4625: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4626: *a = NULL;
4627: PetscFunctionReturn(PETSC_SUCCESS);
4628: }
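/* A minimal sketch of write-only access, overwriting every stored value without triggering a
   host-to-device copy first:

     PetscScalar *a;
     PetscInt     nz = ((Mat_SeqAIJ *)A->data)->nz; // number of stored nonzeros
     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &a));
     PetscCallCUDA(cudaMemset(a, 0, nz * sizeof(PetscScalar))); // zero all stored values on the device
     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &a));
*/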
4630: struct IJCompare4 {
4631: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4632: {
4633: if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4634: if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4635: return false;
4636: }
4637: };
4639: struct Shift {
4640: int _shift;
4642: Shift(int shift) : _shift(shift) { }
4643: __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4644: };
4646: /* merges two SeqAIJCUSPARSE matrices A and B row by row into C = [A, B] (the [A';B']' operation in MATLAB notation) */
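/* In the MAT_INITIAL_MATRIX branch below, the rows of A and B are expanded to COO form, B's column
   indices are shifted by A->cmap->n (the Shift functor), and the two (row, col, value, origin-flag)
   tuple streams are combined with thrust::merge under IJCompare4's (row, col) lexicographic order;
   the origin flag (1 for A, 0 for B) is then used to recover the permutation, stored in Ccusp->coords,
   that scatters A's and B's values into C when the merge is repeated with MAT_REUSE_MATRIX. */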
4647: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4648: {
4649: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4650: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4651: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4652: CsrMatrix *Acsr, *Bcsr, *Ccsr;
4653: PetscInt Annz, Bnnz;
4654: cusparseStatus_t stat;
4655: PetscInt i, m, n, zero = 0;
4657: PetscFunctionBegin;
4660: PetscAssertPointer(C, 4);
4661: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4662: PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4663: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4664: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4665: PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4666: PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4667: if (reuse == MAT_INITIAL_MATRIX) {
4668: m = A->rmap->n;
4669: n = A->cmap->n + B->cmap->n;
4670: PetscCall(MatCreate(PETSC_COMM_SELF, C));
4671: PetscCall(MatSetSizes(*C, m, n, m, n));
4672: PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4673: c = (Mat_SeqAIJ *)(*C)->data;
4674: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4675: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
4676: Ccsr = new CsrMatrix;
4677: Cmat->cprowIndices = NULL;
4678: c->compressedrow.use = PETSC_FALSE;
4679: c->compressedrow.nrows = 0;
4680: c->compressedrow.i = NULL;
4681: c->compressedrow.rindex = NULL;
4682: Ccusp->workVector = NULL;
4683: Ccusp->nrows = m;
4684: Ccusp->mat = Cmat;
4685: Ccusp->mat->mat = Ccsr;
4686: Ccsr->num_rows = m;
4687: Ccsr->num_cols = n;
4688: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4689: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4690: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4691: PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
4692: PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
4693: PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
4694: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4695: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4696: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4697: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4698: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4699: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4700: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4702: Acsr = (CsrMatrix *)Acusp->mat->mat;
4703: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4704: Annz = (PetscInt)Acsr->column_indices->size();
4705: Bnnz = (PetscInt)Bcsr->column_indices->size();
4706: c->nz = Annz + Bnnz;
4707: Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
4708: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4709: Ccsr->values = new THRUSTARRAY(c->nz);
4710: Ccsr->num_entries = c->nz;
4711: Ccusp->coords = new THRUSTINTARRAY(c->nz);
4712: if (c->nz) {
4713: auto Acoo = new THRUSTINTARRAY32(Annz);
4714: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4715: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4716: THRUSTINTARRAY32 *Aroff, *Broff;
4718: if (a->compressedrow.use) { /* need full row offset */
4719: if (!Acusp->rowoffsets_gpu) {
4720: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4721: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4722: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4723: }
4724: Aroff = Acusp->rowoffsets_gpu;
4725: } else Aroff = Acsr->row_offsets;
4726: if (b->compressedrow.use) { /* need full row offset */
4727: if (!Bcusp->rowoffsets_gpu) {
4728: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4729: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4730: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4731: }
4732: Broff = Bcusp->rowoffsets_gpu;
4733: } else Broff = Bcsr->row_offsets;
4734: PetscCall(PetscLogGpuTimeBegin());
4735: stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4736: PetscCallCUSPARSE(stat);
4737: stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4738: PetscCallCUSPARSE(stat);
4739: /* There are issues when using bool with large matrices on SUMMIT with CUDA 10.2.89, so int markers are used instead */
4740: auto Aperm = thrust::make_constant_iterator(1);
4741: auto Bperm = thrust::make_constant_iterator(0);
4742: #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4743: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4744: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4745: #else
4746: /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4747: auto Bcib = Bcsr->column_indices->begin();
4748: auto Bcie = Bcsr->column_indices->end();
4749: thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4750: #endif
4751: auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4752: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4753: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4754: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4755: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4756: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4757: auto p1 = Ccusp->coords->begin();
4758: auto p2 = Ccusp->coords->begin();
4759: thrust::advance(p2, Annz);
4760: PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4761: #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4762: thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4763: #endif
4764: auto cci = thrust::make_counting_iterator(zero);
4765: auto cce = thrust::make_counting_iterator(c->nz);
4766: #if 0 //Errors on SUMMIT cuda 11.1.0
4767: PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4768: #else
4769: auto pred = thrust::identity<int>();
4770: PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4771: PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4772: #endif
4773: stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4774: PetscCallCUSPARSE(stat);
4775: PetscCall(PetscLogGpuTimeEnd());
4776: delete wPerm;
4777: delete Acoo;
4778: delete Bcoo;
4779: delete Ccoo;
4780: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4781: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4782: PetscCallCUSPARSE(stat);
4783: #endif
4784: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4785: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4786: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4787: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4788: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4789: CsrMatrix *CcsrT = new CsrMatrix;
4790: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4791: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4793: (*C)->form_explicit_transpose = PETSC_TRUE;
4794: (*C)->transupdated = PETSC_TRUE;
4795: Ccusp->rowoffsets_gpu = NULL;
4796: CmatT->cprowIndices = NULL;
4797: CmatT->mat = CcsrT;
4798: CcsrT->num_rows = n;
4799: CcsrT->num_cols = m;
4800: CcsrT->num_entries = c->nz;
4802: CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
4803: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4804: CcsrT->values = new THRUSTARRAY(c->nz);
4806: PetscCall(PetscLogGpuTimeBegin());
4807: auto rT = CcsrT->row_offsets->begin();
4808: if (AT) {
4809: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4810: thrust::advance(rT, -1);
4811: }
4812: if (BT) {
4813: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4814: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4815: thrust::copy(titb, tite, rT);
4816: }
4817: auto cT = CcsrT->column_indices->begin();
4818: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4819: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4820: auto vT = CcsrT->values->begin();
4821: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4822: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4823: PetscCall(PetscLogGpuTimeEnd());
4825: PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4826: PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4827: PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4828: PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
4829: PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
4830: PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
4831: PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4832: PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4833: PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4834: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4835: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4836: PetscCallCUSPARSE(stat);
4837: #endif
4838: Ccusp->matTranspose = CmatT;
4839: }
4840: }
4842: c->singlemalloc = PETSC_FALSE;
4843: c->free_a = PETSC_TRUE;
4844: c->free_ij = PETSC_TRUE;
4845: PetscCall(PetscMalloc1(m + 1, &c->i));
4846: PetscCall(PetscMalloc1(c->nz, &c->j));
4847: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4848: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4849: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4850: ii = *Ccsr->row_offsets;
4851: jj = *Ccsr->column_indices;
4852: PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4853: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4854: } else {
4855: PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4856: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4857: }
4858: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4859: PetscCall(PetscMalloc1(m, &c->ilen));
4860: PetscCall(PetscMalloc1(m, &c->imax));
4861: c->maxnz = c->nz;
4862: c->nonzerorowcnt = 0;
4863: c->rmax = 0;
4864: for (i = 0; i < m; i++) {
4865: const PetscInt nn = c->i[i + 1] - c->i[i];
4866: c->ilen[i] = c->imax[i] = nn;
4867: c->nonzerorowcnt += (PetscInt) !!nn;
4868: c->rmax = PetscMax(c->rmax, nn);
4869: }
4870: PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4871: PetscCall(PetscMalloc1(c->nz, &c->a));
4872: (*C)->nonzerostate++;
4873: PetscCall(PetscLayoutSetUp((*C)->rmap));
4874: PetscCall(PetscLayoutSetUp((*C)->cmap));
4875: Ccusp->nonzerostate = (*C)->nonzerostate;
4876: (*C)->preallocated = PETSC_TRUE;
4877: } else {
4878: PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4879: c = (Mat_SeqAIJ *)(*C)->data;
4880: if (c->nz) {
4881: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4882: PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4883: PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4884: PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4885: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4886: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4887: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4888: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4889: Acsr = (CsrMatrix *)Acusp->mat->mat;
4890: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4891: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4892: PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4893: PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4894: PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4895: PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4896: PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4897: auto pmid = Ccusp->coords->begin();
4898: thrust::advance(pmid, Acsr->num_entries);
4899: PetscCall(PetscLogGpuTimeBegin());
4900: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4901: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4902: thrust::for_each(zibait, zieait, VecCUDAEquals());
4903: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4904: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4905: thrust::for_each(zibbit, ziebit, VecCUDAEquals());
4906: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4907: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4908: PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4909: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4910: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4911: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4912: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4913: auto vT = CcsrT->values->begin();
4914: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4915: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4916: (*C)->transupdated = PETSC_TRUE;
4917: }
4918: PetscCall(PetscLogGpuTimeEnd());
4919: }
4920: }
4921: PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4922: (*C)->assembled = PETSC_TRUE;
4923: (*C)->was_assembled = PETSC_FALSE;
4924: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4925: PetscFunctionReturn(PETSC_SUCCESS);
4926: }
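/* A minimal sketch of merging two MATSEQAIJCUSPARSE matrices A and B with the same number of rows
   into C = [A, B], and then refreshing C's values after A or B change numerically (same pattern):

     Mat C;
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C)); // builds C's pattern and values
     // ... modify the values (but not the nonzero pattern) of A and/or B ...
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C));   // reuses C's pattern, updates values
*/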
4928: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4929: {
4930: bool dmem;
4931: const PetscScalar *av;
4933: PetscFunctionBegin;
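/* v may live on the host or on the device; when it is a host pointer the gathered values are
   staged in a temporary device array w and copied back to v at the end */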
4934: dmem = isCudaMem(v);
4935: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
4936: if (n && idx) {
4937: THRUSTINTARRAY widx(n);
4938: widx.assign(idx, idx + n);
4939: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4941: THRUSTARRAY *w = NULL;
4942: thrust::device_ptr<PetscScalar> dv;
4943: if (dmem) {
4944: dv = thrust::device_pointer_cast(v);
4945: } else {
4946: w = new THRUSTARRAY(n);
4947: dv = w->data();
4948: }
4949: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4951: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4952: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4953: thrust::for_each(zibit, zieit, VecCUDAEquals());
4954: if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
4955: delete w;
4956: } else {
4957: PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
4958: }
4959: if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4960: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
4961: PetscFunctionReturn(PETSC_SUCCESS);
4962: }