Actual source code: aijcusparse.cu
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library,
4: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8: #include <petscconf.h>
9: #include <../src/mat/impls/aij/seq/aij.h>
10: #include <../src/mat/impls/sbaij/seq/sbaij.h>
11: #include <../src/vec/vec/impls/dvecimpl.h>
12: #include <petsc/private/vecimpl.h>
13: #undef VecType
14: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15: #include <thrust/adjacent_difference.h>
16: #include <thrust/async/for_each.h>
17: #include <thrust/iterator/constant_iterator.h>
18: #include <thrust/remove.h>
19: #include <thrust/sort.h>
20: #include <thrust/unique.h>
22: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
23: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
24: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we copy them in
25: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command-line options for them.
27: typedef enum {
28: CUSPARSE_MV_ALG_DEFAULT = 0,
29: CUSPARSE_COOMV_ALG = 1,
30: CUSPARSE_CSRMV_ALG1 = 2,
31: CUSPARSE_CSRMV_ALG2 = 3
32: } cusparseSpMVAlg_t;
34: typedef enum {
35: CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36: CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
37: CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
38: CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
39: CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
40: CUSPARSE_SPMM_ALG_DEFAULT = 0,
41: CUSPARSE_SPMM_COO_ALG1 = 1,
42: CUSPARSE_SPMM_COO_ALG2 = 2,
43: CUSPARSE_SPMM_COO_ALG3 = 3,
44: CUSPARSE_SPMM_COO_ALG4 = 5,
45: CUSPARSE_SPMM_CSR_ALG1 = 4,
46: CUSPARSE_SPMM_CSR_ALG2 = 6,
47: } cusparseSpMMAlg_t;
49: typedef enum {
50: CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
51: CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic
52: } cusparseCsr2CscAlg_t;
53: */
54: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
55: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
56: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
57: #endif
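/*
   A minimal illustration (sketch, not library code) of the 0-based ordering described in the comment above:
   because the strings in MatCUSPARSESpMVAlgorithms[] are listed in the same order as the cusparse enum
   values, the PetscEnum produced by PetscOptionsEnum() can be cast directly to the cuSPARSE enum, e.g.

     PetscEnum         choice = (PetscEnum)2;               // position of "CSRMV_ALG1" in MatCUSPARSESpMVAlgorithms[]
     cusparseSpMVAlg_t alg    = (cusparseSpMVAlg_t)choice;  // equals CUSPARSE_CSRMV_ALG1 == 2

   The same convention holds for MatCUSPARSESpMMAlgorithms[] and MatCUSPARSECsr2CscAlgorithms[].
*/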
59: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
63: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
67: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
82: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
88: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
91: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
93: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
95: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
96: {
97: *type = MATSOLVERCUSPARSE;
98: return 0;
99: }
101: /*MC
102: MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
103: on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. The currently supported
104: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
105: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
106: CUSPARSE triangular solve algorithm; however, the performance can be quite poor, so these
107: algorithms are not recommended. This class does NOT support direct solver operations.
109: Level: beginner
111: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
112: M*/
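/*
   A minimal usage sketch (assuming an existing KSP ksp whose matrix is of type MATSEQAIJCUSPARSE):
   selecting this solver for an ILU preconditioner from application code

     PC pc;
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCILU);
     PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);

   or, equivalently, from the command line: -pc_type ilu -pc_factor_mat_solver_type cusparse
*/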
114: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
115: {
116: PetscInt n = A->rmap->n;
118: MatCreate(PetscObjectComm((PetscObject)A),B);
119: MatSetSizes(*B,n,n,n,n);
120: (*B)->factortype = ftype;
121: MatSetType(*B,MATSEQAIJCUSPARSE);
123: if (A->boundtocpu && A->bindingpropagates) MatBindToCPU(*B,PETSC_TRUE);
124: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
125: MatSetBlockSizesFromMats(*B,A,A);
126: if (!A->boundtocpu) {
127: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
128: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
129: } else {
130: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
131: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
132: }
133: PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
134: PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);
135: PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);
136: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
137: if (!A->boundtocpu) {
138: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
139: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
140: } else {
141: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
142: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
143: }
144: PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);
145: PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);
146: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
148: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
149: (*B)->canuseordering = PETSC_TRUE;
150: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
151: return 0;
152: }
154: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
155: {
156: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
158: switch (op) {
159: case MAT_CUSPARSE_MULT:
160: cusparsestruct->format = format;
161: break;
162: case MAT_CUSPARSE_ALL:
163: cusparsestruct->format = format;
164: break;
165: default:
166: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
167: }
168: return 0;
169: }
171: /*@
172: MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
173: operation. Only the MatMult operation can use different GPU storage formats
174: for MPIAIJCUSPARSE matrices.
175: Not Collective
177: Input Parameters:
178: + A - Matrix of type SEQAIJCUSPARSE
179: . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
180: - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, or MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)
184: Level: intermediate
186: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
187: @*/
188: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
189: {
191: PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
192: return 0;
193: }
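/*
   A minimal usage sketch (assuming A is an existing matrix of type MATSEQAIJCUSPARSE): requesting the ELL
   storage format for matrix-vector products only

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);

   the same effect is available at runtime with -mat_cusparse_mult_storage_format ell
*/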
195: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
196: {
197: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
199: cusparsestruct->use_cpu_solve = use_cpu;
200: return 0;
201: }
203: /*@
204: MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU MatSolve().
206: Input Parameters:
207: + A - Matrix of type SEQAIJCUSPARSE
208: - use_cpu - set flag for using the built-in CPU MatSolve
212: Notes:
213: The cuSPARSE LU solver currently computes the factors with the built-in CPU method
214: and moves the factors to the GPU for the solve. We have observed better performance by keeping the data on the CPU and computing the solve there.
215: Use this method to specify whether the solve is done on the CPU or the GPU (the GPU is the default).
217: Level: intermediate
219: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
220: @*/
221: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
222: {
224: PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
225: return 0;
226: }
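/*
   A minimal usage sketch (assuming A is a MATSEQAIJCUSPARSE matrix that will be factored): keeping the
   (I)LU triangular solves on the CPU while the other matrix operations stay on the GPU

     MatCUSPARSESetUseCPUSolve(A,PETSC_TRUE);

   or use the runtime option -mat_cusparse_use_cpu_solve
*/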
228: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
229: {
230: switch (op) {
231: case MAT_FORM_EXPLICIT_TRANSPOSE:
232: /* destroy the transpose matrix if present, to prevent logic errors if flg is later set back to true */
233: if (A->form_explicit_transpose && !flg) MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
234: A->form_explicit_transpose = flg;
235: break;
236: default:
237: MatSetOption_SeqAIJ(A,op,flg);
238: break;
239: }
240: return 0;
241: }
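/*
   A minimal usage sketch (assuming A is a MATSEQAIJCUSPARSE matrix): requesting that an explicit transpose
   be formed and kept up to date so that transpose operations such as MatMultTranspose() can use it

     MatSetOption(A,MAT_FORM_EXPLICIT_TRANSPOSE,PETSC_TRUE);
*/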
243: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
245: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
246: {
247: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
248: IS isrow = b->row,iscol = b->col;
249: PetscBool row_identity,col_identity;
250: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
252: MatSeqAIJCUSPARSECopyFromGPU(A);
253: MatLUFactorNumeric_SeqAIJ(B,A,info);
254: B->offloadmask = PETSC_OFFLOAD_CPU;
255: /* determine which version of MatSolve needs to be used. */
256: ISIdentity(isrow,&row_identity);
257: ISIdentity(iscol,&col_identity);
258: if (row_identity && col_identity) {
259: if (!cusparsestruct->use_cpu_solve) {
260: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
261: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
262: }
263: B->ops->matsolve = NULL;
264: B->ops->matsolvetranspose = NULL;
265: } else {
266: if (!cusparsestruct->use_cpu_solve) {
267: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
268: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
269: }
270: B->ops->matsolve = NULL;
271: B->ops->matsolvetranspose = NULL;
272: }
274: /* get the triangular factors */
275: if (!cusparsestruct->use_cpu_solve) {
276: MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
277: }
278: return 0;
279: }
281: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
282: {
284: MatCUSPARSEStorageFormat format;
285: PetscBool flg;
286: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
288: PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
289: if (A->factortype == MAT_FACTOR_NONE) {
290: PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
291: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
292: if (flg) MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
294: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
295: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
296: if (flg) MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
297: PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);
298: if (flg) MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);
299: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
300: PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
301: "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
302: /* If the user set this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
303: #if CUSPARSE_VERSION > 11301
305: #else
307: #endif
308: PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
309: "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
312: PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
313: "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
315: #endif
316: }
317: PetscOptionsTail();
318: return 0;
319: }
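/*
   A sketch of typical runtime options handled by the routine above, shown for a hypothetical executable
   ./app that uses a MATSEQAIJCUSPARSE matrix (the spmv_alg option requires a CUDA 11 or later build,
   per the guard above):

     ./app -mat_type seqaijcusparse -mat_cusparse_storage_format csr \
           -mat_cusparse_spmv_alg csrmv_alg1 -mat_cusparse_use_cpu_solve

   the enum option values are matched against the entries of MatCUSPARSEStorageFormats[] and
   MatCUSPARSESpMVAlgorithms[] defined near the top of this file
*/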
321: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
322: {
323: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
325: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
326: MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
327: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
328: return 0;
329: }
331: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
332: {
333: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
335: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
336: MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
337: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
338: return 0;
339: }
341: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
342: {
343: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
345: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
346: MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
347: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
348: return 0;
349: }
351: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
352: {
353: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
355: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
356: MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
357: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
358: return 0;
359: }
361: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
362: {
363: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
364: PetscInt n = A->rmap->n;
365: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
366: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
367: const PetscInt *ai = a->i,*aj = a->j,*vi;
368: const MatScalar *aa = a->a,*v;
369: PetscInt *AiLo, *AjLo;
370: PetscInt i,nz, nzLower, offset, rowOffset;
372: if (!n) return 0;
373: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
374: try {
375: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
376: nzLower=n+ai[n]-ai[1];
377: if (!loTriFactor) {
378: PetscScalar *AALo;
380: cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));
382: /* Allocate Space for the lower triangular matrix */
383: cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));
384: cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));
386: /* Fill the lower triangular matrix */
387: AiLo[0] = (PetscInt) 0;
388: AiLo[n] = nzLower;
389: AjLo[0] = (PetscInt) 0;
390: AALo[0] = (MatScalar) 1.0;
391: v = aa;
392: vi = aj;
393: offset = 1;
394: rowOffset= 1;
395: for (i=1; i<n; i++) {
396: nz = ai[i+1] - ai[i];
397: /* additional 1 for the term on the diagonal */
398: AiLo[i] = rowOffset;
399: rowOffset += nz+1;
401: PetscArraycpy(&(AjLo[offset]), vi, nz);
402: PetscArraycpy(&(AALo[offset]), v, nz);
404: offset += nz;
405: AjLo[offset] = (PetscInt) i;
406: AALo[offset] = (MatScalar) 1.0;
407: offset += 1;
409: v += nz;
410: vi += nz;
411: }
413: /* allocate space for the triangular factor information */
414: PetscNew(&loTriFactor);
415: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
416: /* Create the matrix description */
417: cusparseCreateMatDescr(&loTriFactor->descr);
418: cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
419: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
420: cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
421: #else
422: cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
423: #endif
424: cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);
425: cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);
427: /* set the operation */
428: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
430: /* set the matrix */
431: loTriFactor->csrMat = new CsrMatrix;
432: loTriFactor->csrMat->num_rows = n;
433: loTriFactor->csrMat->num_cols = n;
434: loTriFactor->csrMat->num_entries = nzLower;
436: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
437: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
439: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
440: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
442: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
443: loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
445: /* Create the solve analysis information */
446: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
447: cusparse_create_analysis_info(&loTriFactor->solveInfo);
448: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
449: PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
450: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
451: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
452: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
453: &loTriFactor->solveBufferSize));
454: cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);
455: #endif
457: /* perform the solve analysis */
458: PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
459: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
460: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
461: loTriFactor->csrMat->column_indices->data().get(),
462: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
463: loTriFactor->solveInfo,
464: loTriFactor->solvePolicy, loTriFactor->solveBuffer));
465: #else
466: loTriFactor->solveInfo));
467: #endif
468: WaitForCUDA();
469: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
471: /* assign the pointer */
472: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
473: loTriFactor->AA_h = AALo;
474: cudaFreeHost(AiLo);
475: cudaFreeHost(AjLo);
476: PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
477: } else { /* update values only */
478: if (!loTriFactor->AA_h) {
479: cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));
480: }
481: /* Fill the lower triangular matrix */
482: loTriFactor->AA_h[0] = 1.0;
483: v = aa;
484: vi = aj;
485: offset = 1;
486: for (i=1; i<n; i++) {
487: nz = ai[i+1] - ai[i];
488: PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);
489: offset += nz;
490: loTriFactor->AA_h[offset] = 1.0;
491: offset += 1;
492: v += nz;
493: }
494: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
495: PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));
496: }
497: } catch(char *ex) {
498: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
499: }
500: }
501: return 0;
502: }
504: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
505: {
506: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
507: PetscInt n = A->rmap->n;
508: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
509: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
510: const PetscInt *aj = a->j,*adiag = a->diag,*vi;
511: const MatScalar *aa = a->a,*v;
512: PetscInt *AiUp, *AjUp;
513: PetscInt i,nz, nzUpper, offset;
515: if (!n) return 0;
516: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
517: try {
518: /* next, figure out the number of nonzeros in the upper triangular matrix. */
519: nzUpper = adiag[0]-adiag[n];
520: if (!upTriFactor) {
521: PetscScalar *AAUp;
523: cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));
525: /* Allocate Space for the upper triangular matrix */
526: cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));
527: cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));
529: /* Fill the upper triangular matrix */
530: AiUp[0]=(PetscInt) 0;
531: AiUp[n]=nzUpper;
532: offset = nzUpper;
533: for (i=n-1; i>=0; i--) {
534: v = aa + adiag[i+1] + 1;
535: vi = aj + adiag[i+1] + 1;
537: /* number of elements NOT on the diagonal */
538: nz = adiag[i] - adiag[i+1]-1;
540: /* decrement the offset */
541: offset -= (nz+1);
543: /* first, set the diagonal elements */
544: AjUp[offset] = (PetscInt) i;
545: AAUp[offset] = (MatScalar)1./v[nz];
546: AiUp[i] = AiUp[i+1] - (nz+1);
548: PetscArraycpy(&(AjUp[offset+1]), vi, nz);
549: PetscArraycpy(&(AAUp[offset+1]), v, nz);
550: }
552: /* allocate space for the triangular factor information */
553: PetscNew(&upTriFactor);
554: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
556: /* Create the matrix description */
557: cusparseCreateMatDescr(&upTriFactor->descr);
558: cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
559: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
560: cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
561: #else
562: cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
563: #endif
564: cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);
565: cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);
567: /* set the operation */
568: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
570: /* set the matrix */
571: upTriFactor->csrMat = new CsrMatrix;
572: upTriFactor->csrMat->num_rows = n;
573: upTriFactor->csrMat->num_cols = n;
574: upTriFactor->csrMat->num_entries = nzUpper;
576: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
577: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
579: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
580: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
582: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
583: upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
585: /* Create the solve analysis information */
586: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
587: cusparse_create_analysis_info(&upTriFactor->solveInfo);
588: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
589: PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
590: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
591: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
592: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
593: &upTriFactor->solveBufferSize));
594: cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);
595: #endif
597: /* perform the solve analysis */
598: PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
599: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
600: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
601: upTriFactor->csrMat->column_indices->data().get(),
602: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
603: upTriFactor->solveInfo,
604: upTriFactor->solvePolicy, upTriFactor->solveBuffer));
605: #else
606: upTriFactor->solveInfo));
607: #endif
608: WaitForCUDA();
609: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
611: /* assign the pointer */
612: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
613: upTriFactor->AA_h = AAUp;
614: cudaFreeHost(AiUp);
615: cudaFreeHost(AjUp);
616: PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
617: } else {
618: if (!upTriFactor->AA_h) {
619: cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));
620: }
621: /* Fill the upper triangular matrix */
622: offset = nzUpper;
623: for (i=n-1; i>=0; i--) {
624: v = aa + adiag[i+1] + 1;
626: /* number of elements NOT on the diagonal */
627: nz = adiag[i] - adiag[i+1]-1;
629: /* decrement the offset */
630: offset -= (nz+1);
632: /* first, set the diagonal elements */
633: upTriFactor->AA_h[offset] = 1./v[nz];
634: PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);
635: }
636: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
637: PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));
638: }
639: } catch(char *ex) {
640: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
641: }
642: }
643: return 0;
644: }
646: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
647: {
648: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
649: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
650: IS isrow = a->row,iscol = a->icol;
651: PetscBool row_identity,col_identity;
652: PetscInt n = A->rmap->n;
655: MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
656: MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);
658: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
659: cusparseTriFactors->nnz=a->nz;
661: A->offloadmask = PETSC_OFFLOAD_BOTH;
662: /* lower triangular indices */
663: ISIdentity(isrow,&row_identity);
664: if (!row_identity && !cusparseTriFactors->rpermIndices) {
665: const PetscInt *r;
667: ISGetIndices(isrow,&r);
668: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
669: cusparseTriFactors->rpermIndices->assign(r, r+n);
670: ISRestoreIndices(isrow,&r);
671: PetscLogCpuToGpu(n*sizeof(PetscInt));
672: }
674: /* upper triangular indices */
675: ISIdentity(iscol,&col_identity);
676: if (!col_identity && !cusparseTriFactors->cpermIndices) {
677: const PetscInt *c;
679: ISGetIndices(iscol,&c);
680: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
681: cusparseTriFactors->cpermIndices->assign(c, c+n);
682: ISRestoreIndices(iscol,&c);
683: PetscLogCpuToGpu(n*sizeof(PetscInt));
684: }
685: return 0;
686: }
688: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
689: {
690: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
691: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
692: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
693: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
694: PetscInt *AiUp, *AjUp;
695: PetscScalar *AAUp;
696: PetscScalar *AALo;
697: PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
698: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
699: const PetscInt *ai = b->i,*aj = b->j,*vj;
700: const MatScalar *aa = b->a,*v;
702: if (!n) return 0;
703: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
704: try {
705: cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));
706: cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));
707: if (!upTriFactor && !loTriFactor) {
708: /* Allocate Space for the upper triangular matrix */
709: cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));
710: cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));
712: /* Fill the upper triangular matrix */
713: AiUp[0]=(PetscInt) 0;
714: AiUp[n]=nzUpper;
715: offset = 0;
716: for (i=0; i<n; i++) {
717: /* set the pointers */
718: v = aa + ai[i];
719: vj = aj + ai[i];
720: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
722: /* first, set the diagonal elements */
723: AjUp[offset] = (PetscInt) i;
724: AAUp[offset] = (MatScalar)1.0/v[nz];
725: AiUp[i] = offset;
726: AALo[offset] = (MatScalar)1.0/v[nz];
728: offset+=1;
729: if (nz>0) {
730: PetscArraycpy(&(AjUp[offset]), vj, nz);
731: PetscArraycpy(&(AAUp[offset]), v, nz);
732: for (j=offset; j<offset+nz; j++) {
733: AAUp[j] = -AAUp[j];
734: AALo[j] = AAUp[j]/v[nz];
735: }
736: offset+=nz;
737: }
738: }
740: /* allocate space for the triangular factor information */
741: PetscNew(&upTriFactor);
742: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
744: /* Create the matrix description */
745: cusparseCreateMatDescr(&upTriFactor->descr);
746: cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
747: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
748: cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
749: #else
750: cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
751: #endif
752: cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);
753: cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);
755: /* set the matrix */
756: upTriFactor->csrMat = new CsrMatrix;
757: upTriFactor->csrMat->num_rows = A->rmap->n;
758: upTriFactor->csrMat->num_cols = A->cmap->n;
759: upTriFactor->csrMat->num_entries = a->nz;
761: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
762: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
764: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
765: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
767: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
768: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
770: /* set the operation */
771: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
773: /* Create the solve analysis information */
774: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
775: cusparse_create_analysis_info(&upTriFactor->solveInfo);
776: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
777: PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
778: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
779: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
780: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
781: &upTriFactor->solveBufferSize));
782: cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);
783: #endif
785: /* perform the solve analysis */
786: PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
787: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
788: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
789: upTriFactor->csrMat->column_indices->data().get(),
790: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
791: upTriFactor->solveInfo,
792: upTriFactor->solvePolicy, upTriFactor->solveBuffer));
793: #else
794: upTriFactor->solveInfo));
795: #endif
796: WaitForCUDA();
797: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
799: /* assign the pointer */
800: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
802: /* allocate space for the triangular factor information */
803: PetscNew(&loTriFactor);
804: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
806: /* Create the matrix description */
807: cusparseCreateMatDescr(&loTriFactor->descr);
808: cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
809: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
810: cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
811: #else
812: cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
813: #endif
814: cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);
815: cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);
817: /* set the operation */
818: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
820: /* set the matrix */
821: loTriFactor->csrMat = new CsrMatrix;
822: loTriFactor->csrMat->num_rows = A->rmap->n;
823: loTriFactor->csrMat->num_cols = A->cmap->n;
824: loTriFactor->csrMat->num_entries = a->nz;
826: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
827: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
829: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
830: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
832: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
833: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
835: /* Create the solve analysis information */
836: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
837: cusparse_create_analysis_info(&loTriFactor->solveInfo);
838: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
839: PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
840: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
841: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
842: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
843: &loTriFactor->solveBufferSize));
844: cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);
845: #endif
847: /* perform the solve analysis */
848: PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
849: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
850: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
851: loTriFactor->csrMat->column_indices->data().get(),
852: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
853: loTriFactor->solveInfo,
854: loTriFactor->solvePolicy, loTriFactor->solveBuffer));
855: #else
856: loTriFactor->solveInfo));
857: #endif
858: WaitForCUDA();
859: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
861: /* assign the pointer */
862: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
864: PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));
865: cudaFreeHost(AiUp);
866: cudaFreeHost(AjUp);
867: } else {
868: /* Fill the upper triangular matrix */
869: offset = 0;
870: for (i=0; i<n; i++) {
871: /* set the pointers */
872: v = aa + ai[i];
873: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
875: /* first, set the diagonal elements */
876: AAUp[offset] = 1.0/v[nz];
877: AALo[offset] = 1.0/v[nz];
879: offset+=1;
880: if (nz>0) {
881: PetscArraycpy(&(AAUp[offset]), v, nz);
882: for (j=offset; j<offset+nz; j++) {
883: AAUp[j] = -AAUp[j];
884: AALo[j] = AAUp[j]/v[nz];
885: }
886: offset+=nz;
887: }
888: }
891: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
892: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
893: PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));
894: }
895: cudaFreeHost(AAUp);
896: cudaFreeHost(AALo);
897: } catch(char *ex) {
898: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
899: }
900: }
901: return 0;
902: }
904: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
905: {
906: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
907: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
908: IS ip = a->row;
909: PetscBool perm_identity;
910: PetscInt n = A->rmap->n;
913: MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
914: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
915: cusparseTriFactors->nnz=(a->nz-n)*2 + n;
917: A->offloadmask = PETSC_OFFLOAD_BOTH;
919: /* lower triangular indices */
920: ISIdentity(ip,&perm_identity);
921: if (!perm_identity) {
922: IS iip;
923: const PetscInt *irip,*rip;
925: ISInvertPermutation(ip,PETSC_DECIDE,&iip);
926: ISGetIndices(iip,&irip);
927: ISGetIndices(ip,&rip);
928: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
929: cusparseTriFactors->rpermIndices->assign(rip, rip+n);
930: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
931: cusparseTriFactors->cpermIndices->assign(irip, irip+n);
932: ISRestoreIndices(iip,&irip);
933: ISDestroy(&iip);
934: ISRestoreIndices(ip,&rip);
935: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
936: }
937: return 0;
938: }
940: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
941: {
942: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
943: IS ip = b->row;
944: PetscBool perm_identity;
946: MatSeqAIJCUSPARSECopyFromGPU(A);
947: MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
948: B->offloadmask = PETSC_OFFLOAD_CPU;
949: /* determine which version of MatSolve needs to be used. */
950: ISIdentity(ip,&perm_identity);
951: if (perm_identity) {
952: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
953: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
954: B->ops->matsolve = NULL;
955: B->ops->matsolvetranspose = NULL;
956: } else {
957: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
958: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
959: B->ops->matsolve = NULL;
960: B->ops->matsolvetranspose = NULL;
961: }
963: /* get the triangular factors */
964: MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
965: return 0;
966: }
968: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
969: {
970: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
971: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
972: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
973: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
974: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
975: cusparseIndexBase_t indexBase;
976: cusparseMatrixType_t matrixType;
977: cusparseFillMode_t fillMode;
978: cusparseDiagType_t diagType;
980: /* allocate space for the transpose of the lower triangular factor */
981: PetscNew(&loTriFactorT);
982: loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
984: /* set the matrix descriptors of the lower triangular factor */
985: matrixType = cusparseGetMatType(loTriFactor->descr);
986: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
987: fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
988: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
989: diagType = cusparseGetMatDiagType(loTriFactor->descr);
991: /* Create the matrix description */
992: cusparseCreateMatDescr(&loTriFactorT->descr);
993: cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);
994: cusparseSetMatType(loTriFactorT->descr, matrixType);
995: cusparseSetMatFillMode(loTriFactorT->descr, fillMode);
996: cusparseSetMatDiagType(loTriFactorT->descr, diagType);
998: /* set the operation */
999: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1001: /* allocate GPU space for the CSC of the lower triangular factor*/
1002: loTriFactorT->csrMat = new CsrMatrix;
1003: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
1004: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
1005: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
1006: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1007: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1008: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1010: /* compute the transpose of the lower triangular factor, i.e. the CSC */
1011: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1012: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1013: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1014: loTriFactor->csrMat->values->data().get(),
1015: loTriFactor->csrMat->row_offsets->data().get(),
1016: loTriFactor->csrMat->column_indices->data().get(),
1017: loTriFactorT->csrMat->values->data().get(),
1018: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1019: CUSPARSE_ACTION_NUMERIC,indexBase,
1020: CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1021: cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);
1022: #endif
1024: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1025: PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1026: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1027: loTriFactor->csrMat->values->data().get(),
1028: loTriFactor->csrMat->row_offsets->data().get(),
1029: loTriFactor->csrMat->column_indices->data().get(),
1030: loTriFactorT->csrMat->values->data().get(),
1031: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1032: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1033: CUSPARSE_ACTION_NUMERIC, indexBase,
1034: CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
1035: #else
1036: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1037: CUSPARSE_ACTION_NUMERIC, indexBase));
1038: #endif
1039: WaitForCUDA();
1040: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1042: /* Create the solve analysis information */
1043: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1044: cusparse_create_analysis_info(&loTriFactorT->solveInfo);
1045: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1046: PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1047: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1048: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1049: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1050: &loTriFactorT->solveBufferSize));
1051: cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);
1052: #endif
1054: /* perform the solve analysis */
1055: PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1056: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1057: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1058: loTriFactorT->csrMat->column_indices->data().get(),
1059: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1060: loTriFactorT->solveInfo,
1061: loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1062: #else
1063: loTriFactorT->solveInfo));
1064: #endif
1065: WaitForCUDA();
1066: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1068: /* assign the pointer */
1069: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1071: /*********************************************/
1072: /* Now the Transpose of the Upper Tri Factor */
1073: /*********************************************/
1075: /* allocate space for the transpose of the upper triangular factor */
1076: PetscNew(&upTriFactorT);
1077: upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1079: /* set the matrix descriptors of the upper triangular factor */
1080: matrixType = cusparseGetMatType(upTriFactor->descr);
1081: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1082: fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1083: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1084: diagType = cusparseGetMatDiagType(upTriFactor->descr);
1086: /* Create the matrix description */
1087: cusparseCreateMatDescr(&upTriFactorT->descr);
1088: cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);
1089: cusparseSetMatType(upTriFactorT->descr, matrixType);
1090: cusparseSetMatFillMode(upTriFactorT->descr, fillMode);
1091: cusparseSetMatDiagType(upTriFactorT->descr, diagType);
1093: /* set the operation */
1094: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1096: /* allocate GPU space for the CSC of the upper triangular factor*/
1097: upTriFactorT->csrMat = new CsrMatrix;
1098: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
1099: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
1100: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
1101: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1102: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1103: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1105: /* compute the transpose of the upper triangular factor, i.e. the CSC */
1106: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1107: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1108: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1109: upTriFactor->csrMat->values->data().get(),
1110: upTriFactor->csrMat->row_offsets->data().get(),
1111: upTriFactor->csrMat->column_indices->data().get(),
1112: upTriFactorT->csrMat->values->data().get(),
1113: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1114: CUSPARSE_ACTION_NUMERIC,indexBase,
1115: CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1116: cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);
1117: #endif
1119: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1120: PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1121: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1122: upTriFactor->csrMat->values->data().get(),
1123: upTriFactor->csrMat->row_offsets->data().get(),
1124: upTriFactor->csrMat->column_indices->data().get(),
1125: upTriFactorT->csrMat->values->data().get(),
1126: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1127: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1128: CUSPARSE_ACTION_NUMERIC, indexBase,
1129: CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
1130: #else
1131: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1132: CUSPARSE_ACTION_NUMERIC, indexBase));
1133: #endif
1135: WaitForCUDA();
1136: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1138: /* Create the solve analysis information */
1139: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1140: cusparse_create_analysis_info(&upTriFactorT->solveInfo);
1141: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1142: PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1143: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1144: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1145: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1146: &upTriFactorT->solveBufferSize));
1147: cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);
1148: #endif
1150: /* perform the solve analysis */
1151: /* TODO: this repeated solve-analysis setup should be refactored into a shared helper function */
1152: PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1153: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1154: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1155: upTriFactorT->csrMat->column_indices->data().get(),
1156: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1157: upTriFactorT->solveInfo,
1158: upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1159: #else
1160: upTriFactorT->solveInfo));
1161: #endif
1163: WaitForCUDA();
1164: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1166: /* assign the pointer */
1167: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1168: return 0;
1169: }
1171: struct PetscScalarToPetscInt
1172: {
1173: __host__ __device__
1174: PetscInt operator()(PetscScalar s)
1175: {
1176: return (PetscInt)PetscRealPart(s);
1177: }
1178: };
1180: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1181: {
1182: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1183: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1184: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1185: cusparseStatus_t stat;
1186: cusparseIndexBase_t indexBase;
1188: MatSeqAIJCUSPARSECopyToGPU(A);
1189: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1191: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1193: if (A->transupdated) return 0;
1194: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1195: PetscLogGpuTimeBegin();
1196: if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1197: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1198: }
1199: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1200: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1201: cusparseCreateMatDescr(&matstructT->descr);
1202: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1203: cusparseSetMatIndexBase(matstructT->descr, indexBase);
1204: cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
1206: /* set alpha and beta */
1207: cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));
1208: cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));
1209: cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));
1210: cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
1211: cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);
1212: cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
1214: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1215: CsrMatrix *matrixT = new CsrMatrix;
1216: matstructT->mat = matrixT;
1217: matrixT->num_rows = A->cmap->n;
1218: matrixT->num_cols = A->rmap->n;
1219: matrixT->num_entries = a->nz;
1220: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1221: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1222: matrixT->values = new THRUSTARRAY(a->nz);
1224: if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1225: cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1227: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1228: #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1229: stat = cusparseCreateCsr(&matstructT->matDescr,
1230: matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1231: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1232: matrixT->values->data().get(),
1233: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1234: indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1235: #else
1236: /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1237: see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1239: I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1240: it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1241: when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1242: */
1243: if (matrixT->num_entries) {
1244: stat = cusparseCreateCsr(&matstructT->matDescr,
1245: matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1246: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1247: matrixT->values->data().get(),
1248: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
1249: indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1251: } else {
1252: matstructT->matDescr = NULL;
1253: matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1254: }
1255: #endif
1256: #endif
1257: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1258: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1260: #else
1261: CsrMatrix *temp = new CsrMatrix;
1262: CsrMatrix *tempT = new CsrMatrix;
1263: /* First convert HYB to CSR */
1264: temp->num_rows = A->rmap->n;
1265: temp->num_cols = A->cmap->n;
1266: temp->num_entries = a->nz;
1267: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1268: temp->column_indices = new THRUSTINTARRAY32(a->nz);
1269: temp->values = new THRUSTARRAY(a->nz);
1271: stat = cusparse_hyb2csr(cusparsestruct->handle,
1272: matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1273: temp->values->data().get(),
1274: temp->row_offsets->data().get(),
1275: temp->column_indices->data().get());PetscCallCUSPARSE(stat);
1277: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1278: tempT->num_rows = A->rmap->n;
1279: tempT->num_cols = A->cmap->n;
1280: tempT->num_entries = a->nz;
1281: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1282: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1283: tempT->values = new THRUSTARRAY(a->nz);
1285: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1286: temp->num_cols, temp->num_entries,
1287: temp->values->data().get(),
1288: temp->row_offsets->data().get(),
1289: temp->column_indices->data().get(),
1290: tempT->values->data().get(),
1291: tempT->column_indices->data().get(),
1292: tempT->row_offsets->data().get(),
1293: CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1295: /* Last, convert CSC to HYB */
1296: cusparseHybMat_t hybMat;
1297: cusparseCreateHybMat(&hybMat);
1298: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1299: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1300: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1301: matstructT->descr, tempT->values->data().get(),
1302: tempT->row_offsets->data().get(),
1303: tempT->column_indices->data().get(),
1304: hybMat, 0, partition);stat;
1306: /* assign the pointer */
1307: matstructT->mat = hybMat;
1308: A->transupdated = PETSC_TRUE;
1309: /* delete temporaries */
1310: if (tempT) {
1311: if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1312: if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1313: if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1314: delete (CsrMatrix*) tempT;
1315: }
1316: if (temp) {
1317: if (temp->values) delete (THRUSTARRAY*) temp->values;
1318: if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1319: if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1320: delete (CsrMatrix*) temp;
1321: }
1322: #endif
1323: }
1324: }
1325: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may already be present; update its data */
1326: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1327: CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1336: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1337: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1338: cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1339: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
1340: }
1341: if (!cusparsestruct->csr2csc_i) {
1342: THRUSTARRAY csr2csc_a(matrix->num_entries);
1343: PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1345: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1346: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1347: void *csr2cscBuffer;
1348: size_t csr2cscBufferSize;
1349: stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1350: A->cmap->n, matrix->num_entries,
1351: matrix->values->data().get(),
1352: cusparsestruct->rowoffsets_gpu->data().get(),
1353: matrix->column_indices->data().get(),
1354: matrixT->values->data().get(),
1355: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1356: CUSPARSE_ACTION_NUMERIC,indexBase,
1357: cusparsestruct->csr2cscAlg, &csr2cscBufferSize);stat;
1358: cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);
1359: #endif
1361: if (matrix->num_entries) {
1362: /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1363: mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1364: I checked every parameter and they were all fine. I have no clue why cusparse complains.
1366: Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1367: should be filled with indexBase. So I just take a shortcut here.
1368: */
1369: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1370: A->cmap->n,matrix->num_entries,
1371: csr2csc_a.data().get(),
1372: cusparsestruct->rowoffsets_gpu->data().get(),
1373: matrix->column_indices->data().get(),
1374: matrixT->values->data().get(),
1375: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1376: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1377: CUSPARSE_ACTION_NUMERIC,indexBase,
1378: cusparsestruct->csr2cscAlg, csr2cscBuffer);stat;
1379: #else
1380: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1381: CUSPARSE_ACTION_NUMERIC, indexBase);stat;
1382: #endif
1383: } else {
1384: matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1385: }
1387: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1388: PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1389: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1390: cudaFree(csr2cscBuffer);
1391: #endif
1392: }
1393: PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1394: thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1395: matrixT->values->begin()));
1396: }
1397: PetscLogGpuTimeEnd();
1398: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1399: /* the compressed row indices are not used for matTranspose */
1400: matstructT->cprowIndices = NULL;
1401: /* assign the pointer */
1402: ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1403: A->transupdated = PETSC_TRUE;
1404: return 0;
1405: }
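/* For orientation (hedged sketch): the transposed triangular factors consumed by the transpose solves
   below are built lazily on first use.  Assuming a standard cusparse LU factorization, the user-level
   path is roughly

     Mat F;
     MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_LU,&F);
     MatLUFactorSymbolic(F,A,isrow,iscol,&info);
     MatLUFactorNumeric(F,A,&info);
     MatSolveTranspose(F,b,x);   /* dispatches to MatSolveTranspose_SeqAIJCUSPARSE below */

   where isrow/iscol and info are the usual ordering index sets and MatFactorInfo. */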
1407: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1408: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1409: {
1410: PetscInt n = xx->map->n;
1411: const PetscScalar *barray;
1412: PetscScalar *xarray;
1413: thrust::device_ptr<const PetscScalar> bGPU;
1414: thrust::device_ptr<PetscScalar> xGPU;
1415: cusparseStatus_t stat;
1416: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1417: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1418: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1419: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1421: /* Analyze the matrix and create the transpose ... on the fly */
1422: if (!loTriFactorT && !upTriFactorT) {
1423: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1424: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1425: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1426: }
1428: /* Get the GPU pointers */
1429: VecCUDAGetArrayWrite(xx,&xarray);
1430: VecCUDAGetArrayRead(bb,&barray);
1431: xGPU = thrust::device_pointer_cast(xarray);
1432: bGPU = thrust::device_pointer_cast(barray);
1434: PetscLogGpuTimeBegin();
1435: /* First, reorder with the row permutation */
1436: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1437: thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1438: xGPU);
1440: /* Next, solve U */
1441: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1442: upTriFactorT->csrMat->num_rows,
1443: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1444: upTriFactorT->csrMat->num_entries,
1445: #endif
1446: &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1447: upTriFactorT->csrMat->values->data().get(),
1448: upTriFactorT->csrMat->row_offsets->data().get(),
1449: upTriFactorT->csrMat->column_indices->data().get(),
1450: upTriFactorT->solveInfo,
1451: xarray,
1452: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1453: tempGPU->data().get(),
1454: upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);stat;
1455: #else
1456: tempGPU->data().get());stat;
1457: #endif
1459: /* Then, solve L */
1460: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1461: loTriFactorT->csrMat->num_rows,
1462: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1463: loTriFactorT->csrMat->num_entries,
1464: #endif
1465: &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1466: loTriFactorT->csrMat->values->data().get(),
1467: loTriFactorT->csrMat->row_offsets->data().get(),
1468: loTriFactorT->csrMat->column_indices->data().get(),
1469: loTriFactorT->solveInfo,
1470: tempGPU->data().get(),
1471: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1472: xarray,
1473: loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);stat;
1474: #else
1475: xarray);stat;
1476: #endif
1478: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1479: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1480: thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1481: tempGPU->begin());
1483: /* Copy the temporary to the full solution. */
1484: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1486: /* restore */
1487: VecCUDARestoreArrayRead(bb,&barray);
1488: VecCUDARestoreArrayWrite(xx,&xarray);
1489: PetscLogGpuTimeEnd();
1490: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1491: return 0;
1492: }
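/* Note: relative to MatSolve_SeqAIJCUSPARSE below, the factor order is reversed here because
   (LU)^T = U^T L^T, so the transposed upper factor is solved first and the transposed lower factor
   second, with the row/column permutations applied around the pair of solves. */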
1494: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1495: {
1496: const PetscScalar *barray;
1497: PetscScalar *xarray;
1498: cusparseStatus_t stat;
1499: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1500: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1501: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1502: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1504: /* Analyze the matrix and create the transpose ... on the fly */
1505: if (!loTriFactorT && !upTriFactorT) {
1506: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1507: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1508: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1509: }
1511: /* Get the GPU pointers */
1512: VecCUDAGetArrayWrite(xx,&xarray);
1513: VecCUDAGetArrayRead(bb,&barray);
1515: PetscLogGpuTimeBegin();
1516: /* First, solve U */
1517: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1518: upTriFactorT->csrMat->num_rows,
1519: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1520: upTriFactorT->csrMat->num_entries,
1521: #endif
1522: &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1523: upTriFactorT->csrMat->values->data().get(),
1524: upTriFactorT->csrMat->row_offsets->data().get(),
1525: upTriFactorT->csrMat->column_indices->data().get(),
1526: upTriFactorT->solveInfo,
1527: barray,
1528: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1529: tempGPU->data().get(),
1530: upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);stat;
1531: #else
1532: tempGPU->data().get());stat;
1533: #endif
1535: /* Then, solve L */
1536: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1537: loTriFactorT->csrMat->num_rows,
1538: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1539: loTriFactorT->csrMat->num_entries,
1540: #endif
1541: &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1542: loTriFactorT->csrMat->values->data().get(),
1543: loTriFactorT->csrMat->row_offsets->data().get(),
1544: loTriFactorT->csrMat->column_indices->data().get(),
1545: loTriFactorT->solveInfo,
1546: tempGPU->data().get(),
1547: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1548: xarray,
1549: loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);stat;
1550: #else
1551: xarray);stat;
1552: #endif
1554: /* restore */
1555: VecCUDARestoreArrayRead(bb,&barray);
1556: VecCUDARestoreArrayWrite(xx,&xarray);
1557: PetscLogGpuTimeEnd();
1558: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1559: return 0;
1560: }
1562: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1563: {
1564: const PetscScalar *barray;
1565: PetscScalar *xarray;
1566: thrust::device_ptr<const PetscScalar> bGPU;
1567: thrust::device_ptr<PetscScalar> xGPU;
1568: cusparseStatus_t stat;
1569: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1570: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1571: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1572: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1575: /* Get the GPU pointers */
1576: VecCUDAGetArrayWrite(xx,&xarray);
1577: VecCUDAGetArrayRead(bb,&barray);
1578: xGPU = thrust::device_pointer_cast(xarray);
1579: bGPU = thrust::device_pointer_cast(barray);
1581: PetscLogGpuTimeBegin();
1582: /* First, reorder with the row permutation */
1583: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1584: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1585: tempGPU->begin());
1587: /* Next, solve L */
1588: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1589: loTriFactor->csrMat->num_rows,
1590: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1591: loTriFactor->csrMat->num_entries,
1592: #endif
1593: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1594: loTriFactor->csrMat->values->data().get(),
1595: loTriFactor->csrMat->row_offsets->data().get(),
1596: loTriFactor->csrMat->column_indices->data().get(),
1597: loTriFactor->solveInfo,
1598: tempGPU->data().get(),
1599: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1600: xarray,
1601: loTriFactor->solvePolicy, loTriFactor->solveBuffer);stat;
1602: #else
1603: xarray);stat;
1604: #endif
1606: /* Then, solve U */
1607: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1608: upTriFactor->csrMat->num_rows,
1609: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1610: upTriFactor->csrMat->num_entries,
1611: #endif
1612: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1613: upTriFactor->csrMat->values->data().get(),
1614: upTriFactor->csrMat->row_offsets->data().get(),
1615: upTriFactor->csrMat->column_indices->data().get(),
1616: upTriFactor->solveInfo,xarray,
1617: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1618: tempGPU->data().get(),
1619: upTriFactor->solvePolicy, upTriFactor->solveBuffer);stat;
1620: #else
1621: tempGPU->data().get());stat;
1622: #endif
1624: /* Last, reorder with the column permutation */
1625: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1626: thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1627: xGPU);
1629: VecCUDARestoreArrayRead(bb,&barray);
1630: VecCUDARestoreArrayWrite(xx,&xarray);
1631: PetscLogGpuTimeEnd();
1632: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1633: return 0;
1634: }
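/* The _NaturalOrdering variants are used when the factorization was done with identity row and column
   permutations: the two thrust permutation kernels are skipped and b/x are handed directly to the
   triangular solves. */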
1636: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1637: {
1638: const PetscScalar *barray;
1639: PetscScalar *xarray;
1640: cusparseStatus_t stat;
1641: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1642: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1643: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1644: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1646: /* Get the GPU pointers */
1647: VecCUDAGetArrayWrite(xx,&xarray);
1648: VecCUDAGetArrayRead(bb,&barray);
1650: PetscLogGpuTimeBegin();
1651: /* First, solve L */
1652: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1653: loTriFactor->csrMat->num_rows,
1654: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1655: loTriFactor->csrMat->num_entries,
1656: #endif
1657: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1658: loTriFactor->csrMat->values->data().get(),
1659: loTriFactor->csrMat->row_offsets->data().get(),
1660: loTriFactor->csrMat->column_indices->data().get(),
1661: loTriFactor->solveInfo,
1662: barray,
1663: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1664: tempGPU->data().get(),
1665: loTriFactor->solvePolicy,loTriFactor->solveBuffer);stat;
1666: #else
1667: tempGPU->data().get());stat;
1668: #endif
1670: /* Next, solve U */
1671: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1672: upTriFactor->csrMat->num_rows,
1673: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1674: upTriFactor->csrMat->num_entries,
1675: #endif
1676: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1677: upTriFactor->csrMat->values->data().get(),
1678: upTriFactor->csrMat->row_offsets->data().get(),
1679: upTriFactor->csrMat->column_indices->data().get(),
1680: upTriFactor->solveInfo,
1681: tempGPU->data().get(),
1682: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1683: xarray,
1684: upTriFactor->solvePolicy, upTriFactor->solveBuffer);stat;
1685: #else
1686: xarray);stat;
1687: #endif
1689: VecCUDARestoreArrayRead(bb,&barray);
1690: VecCUDARestoreArrayWrite(xx,&xarray);
1691: PetscLogGpuTimeEnd();
1692: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1693: return 0;
1694: }
1696: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1697: {
1698: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1699: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1701: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1702: CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
1704: PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1705: cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);
1706: WaitForCUDA();
1707: PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));
1708: PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1709: A->offloadmask = PETSC_OFFLOAD_BOTH;
1710: }
1711: return 0;
1712: }
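/* Host-side accessors for the AIJ value array: the read and read-write Get variants pull the values
   back from the GPU when the device copy is ahead (via MatSeqAIJCUSPARSECopyFromGPU above), while the
   write-only Get skips the copy since the values will be overwritten; the Restore variants of the
   writable accessors mark the host copy as authoritative by setting the offload mask to
   PETSC_OFFLOAD_CPU. */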
1714: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1715: {
1716: MatSeqAIJCUSPARSECopyFromGPU(A);
1717: *array = ((Mat_SeqAIJ*)A->data)->a;
1718: return 0;
1719: }
1721: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1722: {
1723: A->offloadmask = PETSC_OFFLOAD_CPU;
1724: *array = NULL;
1725: return 0;
1726: }
1728: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
1729: {
1730: MatSeqAIJCUSPARSECopyFromGPU(A);
1731: *array = ((Mat_SeqAIJ*)A->data)->a;
1732: return 0;
1733: }
1735: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
1736: {
1737: *array = NULL;
1738: return 0;
1739: }
1741: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1742: {
1743: *array = ((Mat_SeqAIJ*)A->data)->a;
1744: return 0;
1745: }
1747: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1748: {
1749: A->offloadmask = PETSC_OFFLOAD_CPU;
1750: *array = NULL;
1751: return 0;
1752: }
1754: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
1755: {
1756: Mat_SeqAIJCUSPARSE *cusp;
1757: CsrMatrix *matrix;
1759: MatSeqAIJCUSPARSECopyToGPU(A);
1761: cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
1763: matrix = (CsrMatrix*)cusp->mat->mat;
1765: if (i) {
1766: #if !defined(PETSC_USE_64BIT_INDICES)
1767: *i = matrix->row_offsets->data().get();
1768: #else
1769: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
1770: #endif
1771: }
1772: if (j) {
1773: #if !defined(PETSC_USE_64BIT_INDICES)
1774: *j = matrix->column_indices->data().get();
1775: #else
1776: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
1777: #endif
1778: }
1779: if (a) *a = matrix->values->data().get();
1780: if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
1781: return 0;
1782: }
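/* The i/j/a pointers returned above point into GPU memory (hence mtype = PETSC_MEMTYPE_CUDA), so
   callers must dispatch on the memory type rather than dereference them on the host. */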
1784: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1785: {
1786: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1787: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1788: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1789: PetscInt m = A->rmap->n,*ii,*ridx,tmp;
1790: cusparseStatus_t stat;
1791: PetscBool both = PETSC_TRUE;
1794: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1795: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1796: CsrMatrix *matrix;
1797: matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1800: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1801: matrix->values->assign(a->a, a->a+a->nz);
1802: WaitForCUDA();
1803: PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1804: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1805: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
1806: } else {
1807: PetscInt nnz;
1808: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1809: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1810: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1811: delete cusparsestruct->workVector;
1812: delete cusparsestruct->rowoffsets_gpu;
1813: cusparsestruct->workVector = NULL;
1814: cusparsestruct->rowoffsets_gpu = NULL;
1815: try {
1816: if (a->compressedrow.use) {
1817: m = a->compressedrow.nrows;
1818: ii = a->compressedrow.i;
1819: ridx = a->compressedrow.rindex;
1820: } else {
1821: m = A->rmap->n;
1822: ii = a->i;
1823: ridx = NULL;
1824: }
1826: if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1827: else nnz = a->nz;
1830: /* create cusparse matrix */
1831: cusparsestruct->nrows = m;
1832: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1833: cusparseCreateMatDescr(&matstruct->descr);
1834: cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);
1835: cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
1837: cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));
1838: cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));
1839: cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));
1840: cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
1841: cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);
1842: cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
1843: cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);
1845: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1846: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1847: /* set the matrix */
1848: CsrMatrix *mat= new CsrMatrix;
1849: mat->num_rows = m;
1850: mat->num_cols = A->cmap->n;
1851: mat->num_entries = nnz;
1852: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1853: mat->row_offsets->assign(ii, ii + m+1);
1855: mat->column_indices = new THRUSTINTARRAY32(nnz);
1856: mat->column_indices->assign(a->j, a->j+nnz);
1858: mat->values = new THRUSTARRAY(nnz);
1859: if (a->a) mat->values->assign(a->a, a->a+nnz);
1861: /* assign the pointer */
1862: matstruct->mat = mat;
1863: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1864: if (mat->num_rows) { /* cusparse errors on empty matrices! */
1865: stat = cusparseCreateCsr(&matstruct->matDescr,
1866: mat->num_rows, mat->num_cols, mat->num_entries,
1867: mat->row_offsets->data().get(), mat->column_indices->data().get(),
1868: mat->values->data().get(),
1869: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1870: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);stat;
1871: }
1872: #endif
1873: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1874: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1875: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1876: #else
1877: CsrMatrix *mat= new CsrMatrix;
1878: mat->num_rows = m;
1879: mat->num_cols = A->cmap->n;
1880: mat->num_entries = nnz;
1881: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1882: mat->row_offsets->assign(ii, ii + m+1);
1884: mat->column_indices = new THRUSTINTARRAY32(nnz);
1885: mat->column_indices->assign(a->j, a->j+nnz);
1887: mat->values = new THRUSTARRAY(nnz);
1888: if (a->a) mat->values->assign(a->a, a->a+nnz);
1890: cusparseHybMat_t hybMat;
1891: cusparseCreateHybMat(&hybMat);
1892: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1893: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1894: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1895: matstruct->descr, mat->values->data().get(),
1896: mat->row_offsets->data().get(),
1897: mat->column_indices->data().get(),
1898: hybMat, 0, partition);stat;
1899: /* assign the pointer */
1900: matstruct->mat = hybMat;
1902: if (mat) {
1903: if (mat->values) delete (THRUSTARRAY*)mat->values;
1904: if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1905: if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1906: delete (CsrMatrix*)mat;
1907: }
1908: #endif
1909: }
1911: /* assign the compressed row indices */
1912: if (a->compressedrow.use) {
1913: cusparsestruct->workVector = new THRUSTARRAY(m);
1914: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1915: matstruct->cprowIndices->assign(ridx,ridx+m);
1916: tmp = m;
1917: } else {
1918: cusparsestruct->workVector = NULL;
1919: matstruct->cprowIndices = NULL;
1920: tmp = 0;
1921: }
1922: PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));
1924: /* assign the pointer */
1925: cusparsestruct->mat = matstruct;
1926: } catch(char *ex) {
1927: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1928: }
1929: WaitForCUDA();
1930: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1931: cusparsestruct->nonzerostate = A->nonzerostate;
1932: }
1933: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1934: }
1935: return 0;
1936: }
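/* Sketch (hedged): when the nonzero pattern is unchanged (A->nonzerostate still matches the cached
   nonzerostate) only the values are re-uploaded by the branch above, e.g.

     MatSetValues(A,1,&row,1,&col,&val,INSERT_VALUES);   /* same sparsity pattern as before */
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
     MatMult(A,x,y);   /* triggers MatSeqAIJCUSPARSECopyToGPU: values-only path */

   whereas a changed pattern rebuilds the whole Mat_SeqAIJCUSPARSEMultStruct. */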
1938: struct VecCUDAPlusEquals
1939: {
1940: template <typename Tuple>
1941: __host__ __device__
1942: void operator()(Tuple t)
1943: {
1944: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1945: }
1946: };
1948: struct VecCUDAEquals
1949: {
1950: template <typename Tuple>
1951: __host__ __device__
1952: void operator()(Tuple t)
1953: {
1954: thrust::get<1>(t) = thrust::get<0>(t);
1955: }
1956: };
1958: struct VecCUDAEqualsReverse
1959: {
1960: template <typename Tuple>
1961: __host__ __device__
1962: void operator()(Tuple t)
1963: {
1964: thrust::get<0>(t) = thrust::get<1>(t);
1965: }
1966: };
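/* These functors operate on thrust zip-iterator tuples.  For illustration (hedged), VecCUDAPlusEquals
   implements y[i] += x[i] over a zipped range:

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(x.begin(),y.begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(x.end(),y.end())),
                      VecCUDAPlusEquals());
*/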
1968: struct MatMatCusparse {
1969: PetscBool cisdense;
1970: PetscScalar *Bt;
1971: Mat X;
1972: PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1973: PetscLogDouble flops;
1974: CsrMatrix *Bcsr;
1976: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1977: cusparseSpMatDescr_t matSpBDescr;
1978: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
1979: cusparseDnMatDescr_t matBDescr;
1980: cusparseDnMatDescr_t matCDescr;
1981: PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1982: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
1983: void *dBuffer4;
1984: void *dBuffer5;
1985: #endif
1986: size_t mmBufferSize;
1987: void *mmBuffer;
1988: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1989: cusparseSpGEMMDescr_t spgemmDesc;
1990: #endif
1991: };
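/* MatMatCusparse caches the cusparse descriptors and work buffers across repeated numeric phases, so
   that MatProductNumeric calls on the same operands do not redo the SpMM/SpGEMM setup. */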
1993: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1994: {
1995: MatMatCusparse *mmdata = (MatMatCusparse *)data;
1997: cudaFree(mmdata->Bt);
1998: delete mmdata->Bcsr;
1999: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2000: if (mmdata->matSpBDescr) cusparseDestroySpMat(mmdata->matSpBDescr);
2001: if (mmdata->matBDescr) cusparseDestroyDnMat(mmdata->matBDescr);
2002: if (mmdata->matCDescr) cusparseDestroyDnMat(mmdata->matCDescr);
2003: if (mmdata->spgemmDesc) cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);
2004: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2005: if (mmdata->dBuffer4) cudaFree(mmdata->dBuffer4);
2006: if (mmdata->dBuffer5) cudaFree(mmdata->dBuffer5);
2007: #endif
2008: if (mmdata->mmBuffer) cudaFree(mmdata->mmBuffer);
2009: if (mmdata->mmBuffer2) cudaFree(mmdata->mmBuffer2);
2010: #endif
2011: MatDestroy(&mmdata->X);
2012: PetscFree(data);
2013: return 0;
2014: }
2016: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2018: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2019: {
2020: Mat_Product *product = C->product;
2021: Mat A,B;
2022: PetscInt m,n,blda,clda;
2023: PetscBool flg,biscuda;
2024: Mat_SeqAIJCUSPARSE *cusp;
2025: cusparseStatus_t stat;
2026: cusparseOperation_t opA;
2027: const PetscScalar *barray;
2028: PetscScalar *carray;
2029: MatMatCusparse *mmdata;
2030: Mat_SeqAIJCUSPARSEMultStruct *mat;
2031: CsrMatrix *csrmat;
2033: MatCheckProduct(C,1);
2035: mmdata = (MatMatCusparse*)product->data;
2036: A = product->A;
2037: B = product->B;
2038: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2040: /* currently CopyToGpu does not copy if the matrix is bound to the CPU.
2041: Instead of silently accepting the wrong answer, I prefer to raise an error */
2043: MatSeqAIJCUSPARSECopyToGPU(A);
2044: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2045: switch (product->type) {
2046: case MATPRODUCT_AB:
2047: case MATPRODUCT_PtAP:
2048: mat = cusp->mat;
2049: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2050: m = A->rmap->n;
2051: n = B->cmap->n;
2052: break;
2053: case MATPRODUCT_AtB:
2054: if (!A->form_explicit_transpose) {
2055: mat = cusp->mat;
2056: opA = CUSPARSE_OPERATION_TRANSPOSE;
2057: } else {
2058: MatSeqAIJCUSPARSEFormExplicitTranspose(A);
2059: mat = cusp->matTranspose;
2060: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2061: }
2062: m = A->cmap->n;
2063: n = B->cmap->n;
2064: break;
2065: case MATPRODUCT_ABt:
2066: case MATPRODUCT_RARt:
2067: mat = cusp->mat;
2068: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2069: m = A->rmap->n;
2070: n = B->rmap->n;
2071: break;
2072: default:
2073: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2074: }
2076: csrmat = (CsrMatrix*)mat->mat;
2077: /* if the user passed a CPU matrix, copy the data to the GPU */
2078: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
2079: if (!biscuda) MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
2080: MatDenseCUDAGetArrayRead(B,&barray);
2082: MatDenseGetLDA(B,&blda);
2083: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2084: MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
2085: MatDenseGetLDA(mmdata->X,&clda);
2086: } else {
2087: MatDenseCUDAGetArrayWrite(C,&carray);
2088: MatDenseGetLDA(C,&clda);
2089: }
2091: PetscLogGpuTimeBegin();
2092: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2093: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2094: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2095: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2096: size_t mmBufferSize;
2097: if (mmdata->initialized && mmdata->Blda != blda) {cusparseDestroyDnMat(mmdata->matBDescr); mmdata->matBDescr = NULL;}
2098: if (!mmdata->matBDescr) {
2099: cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);
2100: mmdata->Blda = blda;
2101: }
2103: if (mmdata->initialized && mmdata->Clda != clda) {cusparseDestroyDnMat(mmdata->matCDescr); mmdata->matCDescr = NULL;}
2104: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2105: cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);
2106: mmdata->Clda = clda;
2107: }
2109: if (!mat->matDescr) {
2110: stat = cusparseCreateCsr(&mat->matDescr,
2111: csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2112: csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2113: csrmat->values->data().get(),
2114: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2115: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);stat;
2116: }
2117: stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2118: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2119: mmdata->matCDescr,cusparse_scalartype,
2120: cusp->spmmAlg,&mmBufferSize);stat;
2121: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2122: cudaFree(mmdata->mmBuffer);
2123: cudaMalloc(&mmdata->mmBuffer,mmBufferSize);
2124: mmdata->mmBufferSize = mmBufferSize;
2125: }
2126: mmdata->initialized = PETSC_TRUE;
2127: } else {
2128: /* to be safe, always update pointers of the mats */
2129: cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());
2130: cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);
2131: cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);
2132: }
2134: /* do cusparseSpMM, which supports transpose on B */
2135: stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2136: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2137: mmdata->matCDescr,cusparse_scalartype,
2138: cusp->spmmAlg,mmdata->mmBuffer);stat;
2139: #else
2140: PetscInt k;
2141: /* cusparseXcsrmm does not support transpose on B */
2142: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2143: cublasHandle_t cublasv2handle;
2144: cublasStatus_t cerr;
2146: PetscCUBLASGetHandle(&cublasv2handle);
2147: cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2148: B->cmap->n,B->rmap->n,
2149: &PETSC_CUSPARSE_ONE ,barray,blda,
2150: &PETSC_CUSPARSE_ZERO,barray,blda,
2151: mmdata->Bt,B->cmap->n);cerr;
2152: blda = B->cmap->n;
2153: k = B->cmap->n;
2154: } else {
2155: k = B->rmap->n;
2156: }
2158: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2159: stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2160: csrmat->num_entries,mat->alpha_one,mat->descr,
2161: csrmat->values->data().get(),
2162: csrmat->row_offsets->data().get(),
2163: csrmat->column_indices->data().get(),
2164: mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2165: carray,clda);stat;
2166: #endif
2167: PetscLogGpuTimeEnd();
2168: PetscLogGpuFlops(n*2.0*csrmat->num_entries);
2169: MatDenseCUDARestoreArrayRead(B,&barray);
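/* For MATPRODUCT_RARt and MATPRODUCT_PtAP the SpMM above only produced the intermediate
   X = A*B^T (resp. X = A*B); the remaining dense-dense product with B (resp. B^T) is performed next. */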
2170: if (product->type == MATPRODUCT_RARt) {
2171: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2172: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
2173: } else if (product->type == MATPRODUCT_PtAP) {
2174: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2175: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
2176: } else {
2177: MatDenseCUDARestoreArrayWrite(C,&carray);
2178: }
2179: if (mmdata->cisdense) {
2180: MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
2181: }
2182: if (!biscuda) {
2183: MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
2184: }
2185: return 0;
2186: }
2188: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2189: {
2190: Mat_Product *product = C->product;
2191: Mat A,B;
2192: PetscInt m,n;
2193: PetscBool cisdense,flg;
2194: MatMatCusparse *mmdata;
2195: Mat_SeqAIJCUSPARSE *cusp;
2197: MatCheckProduct(C,1);
2199: A = product->A;
2200: B = product->B;
2201: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2203: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2205: switch (product->type) {
2206: case MATPRODUCT_AB:
2207: m = A->rmap->n;
2208: n = B->cmap->n;
2209: break;
2210: case MATPRODUCT_AtB:
2211: m = A->cmap->n;
2212: n = B->cmap->n;
2213: break;
2214: case MATPRODUCT_ABt:
2215: m = A->rmap->n;
2216: n = B->rmap->n;
2217: break;
2218: case MATPRODUCT_PtAP:
2219: m = B->cmap->n;
2220: n = B->cmap->n;
2221: break;
2222: case MATPRODUCT_RARt:
2223: m = B->rmap->n;
2224: n = B->rmap->n;
2225: break;
2226: default:
2227: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2228: }
2229: MatSetSizes(C,m,n,m,n);
2230: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2231: PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
2232: MatSetType(C,MATSEQDENSECUDA);
2234: /* product data */
2235: PetscNew(&mmdata);
2236: mmdata->cisdense = cisdense;
2237: #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2238: /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2239: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2240: cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));
2241: }
2242: #endif
2243: /* for these products we need intermediate storage */
2244: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2245: MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
2246: MatSetType(mmdata->X,MATSEQDENSECUDA);
2247: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2248: MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
2249: } else {
2250: MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
2251: }
2252: }
2253: C->product->data = mmdata;
2254: C->product->destroy = MatDestroy_MatMatCusparse;
2256: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2257: return 0;
2258: }
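/* Sketch (hedged) of the user-level path reaching the two routines above for a sparse-dense product:

     MatProductCreate(A,B,NULL,&C);   /* A is MATSEQAIJCUSPARSE; a plain MATSEQDENSE B is converted on the fly */
     MatProductSetType(C,MATPRODUCT_AB);
     MatProductSetFromOptions(C);
     MatProductSymbolic(C);           /* MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA */
     MatProductNumeric(C);            /* MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA */
*/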
2260: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2261: {
2262: Mat_Product *product = C->product;
2263: Mat A,B;
2264: Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp;
2265: Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
2266: Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2267: CsrMatrix *Acsr,*Bcsr,*Ccsr;
2268: PetscBool flg;
2269: cusparseStatus_t stat;
2270: MatProductType ptype;
2271: MatMatCusparse *mmdata;
2272: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2273: cusparseSpMatDescr_t BmatSpDescr;
2274: #endif
2275: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2277: MatCheckProduct(C,1);
2279: PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);
2281: mmdata = (MatMatCusparse*)C->product->data;
2282: A = product->A;
2283: B = product->B;
2284: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2285: mmdata->reusesym = PETSC_FALSE;
2286: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2288: Cmat = Ccusp->mat;
2290: Ccsr = (CsrMatrix*)Cmat->mat;
2292: goto finalize;
2293: }
2294: if (!c->nz) goto finalize;
2295: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2297: PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2301: Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2302: Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2303: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2307: MatSeqAIJCUSPARSECopyToGPU(A);
2308: MatSeqAIJCUSPARSECopyToGPU(B);
2310: ptype = product->type;
2311: if (A->symmetric && ptype == MATPRODUCT_AtB) {
2312: ptype = MATPRODUCT_AB;
2314: }
2315: if (B->symmetric && ptype == MATPRODUCT_ABt) {
2316: ptype = MATPRODUCT_AB;
2318: }
2319: switch (ptype) {
2320: case MATPRODUCT_AB:
2321: Amat = Acusp->mat;
2322: Bmat = Bcusp->mat;
2323: break;
2324: case MATPRODUCT_AtB:
2325: Amat = Acusp->matTranspose;
2326: Bmat = Bcusp->mat;
2327: break;
2328: case MATPRODUCT_ABt:
2329: Amat = Acusp->mat;
2330: Bmat = Bcusp->matTranspose;
2331: break;
2332: default:
2333: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2334: }
2335: Cmat = Ccusp->mat;
2339: Acsr = (CsrMatrix*)Amat->mat;
2340: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2341: Ccsr = (CsrMatrix*)Cmat->mat;
2345: PetscLogGpuTimeBegin();
2346: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2347: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2348: cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);
2349: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2350: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2351: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2352: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2353: mmdata->spgemmDesc);stat;
2354: #else
2355: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2356: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2357: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2358: mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);stat;
2359: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2360: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2361: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);stat;
2362: #endif
2363: #else
2364: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2365: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2366: Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2367: Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2368: Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());stat;
2369: #endif
2370: PetscLogGpuFlops(mmdata->flops);
2371: WaitForCUDA();
2372: PetscLogGpuTimeEnd();
2373: C->offloadmask = PETSC_OFFLOAD_GPU;
2374: finalize:
2375: /* shorter version of MatAssemblyEnd_SeqAIJ */
2376: PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);
2377: PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");
2378: PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);
2379: c->reallocs = 0;
2380: C->info.mallocs += 0;
2381: C->info.nz_unneeded = 0;
2382: C->assembled = C->was_assembled = PETSC_TRUE;
2383: C->num_ass++;
2384: return 0;
2385: }
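/* Sketch (hedged): for two MATSEQAIJCUSPARSE operands the usual entry point is

     MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);

   which goes through the symbolic routine below and the numeric routine above; with MAT_REUSE_MATRIX
   only the numeric phase is repeated. */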
2387: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2388: {
2389: Mat_Product *product = C->product;
2390: Mat A,B;
2391: Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp;
2392: Mat_SeqAIJ *a,*b,*c;
2393: Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2394: CsrMatrix *Acsr,*Bcsr,*Ccsr;
2395: PetscInt i,j,m,n,k;
2396: PetscBool flg;
2397: cusparseStatus_t stat;
2398: MatProductType ptype;
2399: MatMatCusparse *mmdata;
2400: PetscLogDouble flops;
2401: PetscBool biscompressed,ciscompressed;
2402: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2403: int64_t C_num_rows1, C_num_cols1, C_nnz1;
2404: cusparseSpMatDescr_t BmatSpDescr;
2405: #else
2406: int cnz;
2407: #endif
2408: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2410: MatCheckProduct(C,1);
2412: A = product->A;
2413: B = product->B;
2414: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2416: PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2418: a = (Mat_SeqAIJ*)A->data;
2419: b = (Mat_SeqAIJ*)B->data;
2420: /* product data */
2421: PetscNew(&mmdata);
2422: C->product->data = mmdata;
2423: C->product->destroy = MatDestroy_MatMatCusparse;
2425: MatSeqAIJCUSPARSECopyToGPU(A);
2426: MatSeqAIJCUSPARSECopyToGPU(B);
2427: Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2428: Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2432: ptype = product->type;
2433: if (A->symmetric && ptype == MATPRODUCT_AtB) {
2434: ptype = MATPRODUCT_AB;
2435: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2436: }
2437: if (B->symmetric && ptype == MATPRODUCT_ABt) {
2438: ptype = MATPRODUCT_AB;
2439: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2440: }
2441: biscompressed = PETSC_FALSE;
2442: ciscompressed = PETSC_FALSE;
2443: switch (ptype) {
2444: case MATPRODUCT_AB:
2445: m = A->rmap->n;
2446: n = B->cmap->n;
2447: k = A->cmap->n;
2448: Amat = Acusp->mat;
2449: Bmat = Bcusp->mat;
2450: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2451: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2452: break;
2453: case MATPRODUCT_AtB:
2454: m = A->cmap->n;
2455: n = B->cmap->n;
2456: k = A->rmap->n;
2457: MatSeqAIJCUSPARSEFormExplicitTranspose(A);
2458: Amat = Acusp->matTranspose;
2459: Bmat = Bcusp->mat;
2460: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2461: break;
2462: case MATPRODUCT_ABt:
2463: m = A->rmap->n;
2464: n = B->rmap->n;
2465: k = A->cmap->n;
2466: MatSeqAIJCUSPARSEFormExplicitTranspose(B);
2467: Amat = Acusp->mat;
2468: Bmat = Bcusp->matTranspose;
2469: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2470: break;
2471: default:
2472: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2473: }
2475: /* create cusparse matrix */
2476: MatSetSizes(C,m,n,m,n);
2477: MatSetType(C,MATSEQAIJCUSPARSE);
2478: c = (Mat_SeqAIJ*)C->data;
2479: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2480: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
2481: Ccsr = new CsrMatrix;
2483: c->compressedrow.use = ciscompressed;
2484: if (c->compressedrow.use) { /* if a is in compressed row format, then c will be too */
2485: c->compressedrow.nrows = a->compressedrow.nrows;
2486: PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);
2487: PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);
2488: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2489: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2490: Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2491: } else {
2492: c->compressedrow.nrows = 0;
2493: c->compressedrow.i = NULL;
2494: c->compressedrow.rindex = NULL;
2495: Ccusp->workVector = NULL;
2496: Cmat->cprowIndices = NULL;
2497: }
2498: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2499: Ccusp->mat = Cmat;
2500: Ccusp->mat->mat = Ccsr;
2501: Ccsr->num_rows = Ccusp->nrows;
2502: Ccsr->num_cols = n;
2503: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2504: cusparseCreateMatDescr(&Cmat->descr);
2505: cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);
2506: cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
2507: cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));
2508: cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));
2509: cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));
2510: cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
2511: cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);
2512: cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
2513: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
2514: thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2515: c->nz = 0;
2516: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2517: Ccsr->values = new THRUSTARRAY(c->nz);
2518: goto finalizesym;
2519: }
2523: Acsr = (CsrMatrix*)Amat->mat;
2524: if (!biscompressed) {
2525: Bcsr = (CsrMatrix*)Bmat->mat;
2526: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2527: BmatSpDescr = Bmat->matDescr;
2528: #endif
2529: } else { /* we need to use row offsets for the full matrix */
2530: CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2531: Bcsr = new CsrMatrix;
2532: Bcsr->num_rows = B->rmap->n;
2533: Bcsr->num_cols = cBcsr->num_cols;
2534: Bcsr->num_entries = cBcsr->num_entries;
2535: Bcsr->column_indices = cBcsr->column_indices;
2536: Bcsr->values = cBcsr->values;
2537: if (!Bcusp->rowoffsets_gpu) {
2538: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2539: Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2540: PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
2541: }
2542: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2543: mmdata->Bcsr = Bcsr;
2544: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2545: if (Bcsr->num_rows && Bcsr->num_cols) {
2546: stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2547: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2548: Bcsr->values->data().get(),
2549: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2550: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);stat;
2551: }
2552: BmatSpDescr = mmdata->matSpBDescr;
2553: #endif
2554: }
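/* When B is stored in compressed row format, Bcsr aliases B's column indices and values but swaps in
   the full (uncompressed) row offsets, so that cusparse sees a CSR operand with all B->rmap->n rows. */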
2557: /* precompute flops count */
2558: if (ptype == MATPRODUCT_AB) {
2559: for (i=0, flops = 0; i<A->rmap->n; i++) {
2560: const PetscInt st = a->i[i];
2561: const PetscInt en = a->i[i+1];
2562: for (j=st; j<en; j++) {
2563: const PetscInt brow = a->j[j];
2564: flops += 2.*(b->i[brow+1] - b->i[brow]);
2565: }
2566: }
2567: } else if (ptype == MATPRODUCT_AtB) {
2568: for (i=0, flops = 0; i<A->rmap->n; i++) {
2569: const PetscInt anzi = a->i[i+1] - a->i[i];
2570: const PetscInt bnzi = b->i[i+1] - b->i[i];
2571: flops += (2.*anzi)*bnzi;
2572: }
2573: } else { /* TODO */
2574: flops = 0.;
2575: }
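/* Flop estimates: for MATPRODUCT_AB each stored entry a(i,brow) contributes 2*nnz(B(brow,:)) flops;
   for MATPRODUCT_AtB row i of A is paired with row i of B, contributing 2*nnz(A(i,:))*nnz(B(i,:)). */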
2577: mmdata->flops = flops;
2578: PetscLogGpuTimeBegin();
2580: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2581: cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);
2582: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2583: NULL, NULL, NULL,
2584: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2585: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);stat;
2586: cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);
2587: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2588: {
2589: /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2590: We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2591: */
2592: void* dBuffer1 = NULL;
2593: void* dBuffer2 = NULL;
2594: void* dBuffer3 = NULL;
2595: /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2596: size_t bufferSize1 = 0;
2597: size_t bufferSize2 = 0;
2598: size_t bufferSize3 = 0;
2599: size_t bufferSize4 = 0;
2600: size_t bufferSize5 = 0;
2602: /*----------------------------------------------------------------------*/
2603: /* ask bufferSize1 bytes for external memory */
2604: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2605: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2606: &bufferSize1, NULL);stat;
2607: cudaMalloc((void**) &dBuffer1, bufferSize1);
2608: /* inspect the matrices A and B to understand the memory requirement for the next step */
2609: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2610: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2611: &bufferSize1, dBuffer1);stat;
2613: /*----------------------------------------------------------------------*/
2614: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2615: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2616: &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);stat;
2617: cudaMalloc((void**) &dBuffer2, bufferSize2);
2618: cudaMalloc((void**) &dBuffer3, bufferSize3);
2619: cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);
2620: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2621: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2622: &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);stat;
2623: cudaFree(dBuffer1);
2624: cudaFree(dBuffer2);
2626: /*----------------------------------------------------------------------*/
2627: /* get matrix C non-zero entries C_nnz1 */
2628: cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);
2629: c->nz = (PetscInt) C_nnz1;
2630: /* allocate matrix C */
2631: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);cudaPeekAtLastError(); /* catch out of memory errors */
2632: Ccsr->values = new THRUSTARRAY(c->nz);cudaPeekAtLastError(); /* catch out of memory errors */
2633: /* update matC with the new pointers */
2634: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2635: Ccsr->values->data().get());stat;
2637: /*----------------------------------------------------------------------*/
2638: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2639: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2640: &bufferSize5, NULL);stat;
2641: cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);
2642: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2643: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2644: &bufferSize5, mmdata->dBuffer5);stat;
2645: cudaFree(dBuffer3);
2646: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2647: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2648: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2649: mmdata->spgemmDesc);stat;
2650: PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);
2651: }
2652: #else
2653: size_t bufSize2;
2654: /* ask bufferSize bytes for external memory */
2655: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2656: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2657: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2658: mmdata->spgemmDesc, &bufSize2, NULL);stat;
2659: cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);
2660: /* inspect the matrices A and B to understand the memory requirement for the next step */
2661: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2662: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2663: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2664: mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);stat;
2665: /* ask again for the number of bytes of external memory needed */
2666: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2667: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2668: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2669: mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);stat;
2670: /* Neither the CUSPARSE documentation nor the API is clear here:
2671: we need both buffers to perform the operations properly!
2672: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2673: it only appears in the workEstimation call, yet it seems to be needed in compute, so presumably its
2674: address is stored in the descriptor. What a messy API... */
2675: cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);
2676: /* compute the intermediate product of A * B */
2677: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2678: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2679: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2680: mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);stat;
2681: /* get matrix C non-zero entries C_nnz1 */
2682: cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);
2683: c->nz = (PetscInt) C_nnz1;
2684: PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);
2685: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2686: cudaPeekAtLastError(); /* catch out of memory errors */
2687: Ccsr->values = new THRUSTARRAY(c->nz);
2688: cudaPeekAtLastError(); /* catch out of memory errors */
2689: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2690: Ccsr->values->data().get());stat;
2691: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2692: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2693: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);stat;
2694: #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2695: #else
2696: cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);
2697: stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2698: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2699: Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2700: Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2701: Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
2702: c->nz = cnz;
2703: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2704: cudaPeekAtLastError(); /* catch out of memory errors */
2705: Ccsr->values = new THRUSTARRAY(c->nz);
2706: cudaPeekAtLastError(); /* catch out of memory errors */
2708: cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);
2709: /* with the old gemm interface (removed from 11.0 on) we cannot compute only the symbolic factorization.
2710: I have tried the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when
2711: D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2712: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2713: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2714: Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2715: Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2716: Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
2717: #endif
2718: PetscLogGpuFlops(mmdata->flops);
2719: PetscLogGpuTimeEnd();
2720: finalizesym:
2721: c->singlemalloc = PETSC_FALSE;
2722: c->free_a = PETSC_TRUE;
2723: c->free_ij = PETSC_TRUE;
2724: PetscMalloc1(m+1,&c->i);
2725: PetscMalloc1(c->nz,&c->j);
2726: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2727: PetscInt *d_i = c->i;
2728: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2729: THRUSTINTARRAY jj(Ccsr->column_indices->size());
2730: ii = *Ccsr->row_offsets;
2731: jj = *Ccsr->column_indices;
2732: if (ciscompressed) d_i = c->compressedrow.i;
2733: cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
2734: cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
2735: } else {
2736: PetscInt *d_i = c->i;
2737: if (ciscompressed) d_i = c->compressedrow.i;
2738: cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
2739: cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
2740: }
2741: if (ciscompressed) { /* need to expand host row offsets */
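/* Worked example (illustrative numbers, not from the source): with m = 5, compressedrow.nrows = 2,
   rindex = [1,3] and compressed offsets compressedrow.i = [0,2,5], the loop below expands to the full
   c->i = [0,0,2,2,5,5]: rows 0, 2 and 4 are empty, while rows 1 and 3 keep their 2 and 3 nonzeros. */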
2742: PetscInt r = 0;
2743: c->i[0] = 0;
2744: for (k = 0; k < c->compressedrow.nrows; k++) {
2745: const PetscInt next = c->compressedrow.rindex[k];
2746: const PetscInt old = c->compressedrow.i[k];
2747: for (; r < next; r++) c->i[r+1] = old;
2748: }
2749: for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2750: }
2751: PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
2752: PetscMalloc1(m,&c->ilen);
2753: PetscMalloc1(m,&c->imax);
2754: c->maxnz = c->nz;
2755: c->nonzerorowcnt = 0;
2756: c->rmax = 0;
2757: for (k = 0; k < m; k++) {
2758: const PetscInt nn = c->i[k+1] - c->i[k];
2759: c->ilen[k] = c->imax[k] = nn;
2760: c->nonzerorowcnt += (PetscInt)!!nn;
2761: c->rmax = PetscMax(c->rmax,nn);
2762: }
2763: MatMarkDiagonal_SeqAIJ(C);
2764: PetscMalloc1(c->nz,&c->a);
2765: Ccsr->num_entries = c->nz;
2767: C->nonzerostate++;
2768: PetscLayoutSetUp(C->rmap);
2769: PetscLayoutSetUp(C->cmap);
2770: Ccusp->nonzerostate = C->nonzerostate;
2771: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2772: C->preallocated = PETSC_TRUE;
2773: C->assembled = PETSC_FALSE;
2774: C->was_assembled = PETSC_FALSE;
2775: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2776: mmdata->reusesym = PETSC_TRUE;
2777: C->offloadmask = PETSC_OFFLOAD_GPU;
2778: }
2779: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2780: return 0;
2781: }
2783: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2785: /* handles sparse or dense B */
2786: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2787: {
2788: Mat_Product *product = mat->product;
2790: PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2792: MatCheckProduct(mat,1);
2793: PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);
2794: if (!product->A->boundtocpu && !product->B->boundtocpu) {
2795: PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);
2796: }
2797: if (product->type == MATPRODUCT_ABC) {
2798: Ciscusp = PETSC_FALSE;
2799: if (!product->C->boundtocpu) {
2800: PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);
2801: }
2802: }
2803: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2804: PetscBool usecpu = PETSC_FALSE;
2805: switch (product->type) {
2806: case MATPRODUCT_AB:
2807: if (product->api_user) {
2808: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
2809: PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2810: PetscOptionsEnd();
2811: } else {
2812: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
2813: PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2814: PetscOptionsEnd();
2815: }
2816: break;
2817: case MATPRODUCT_AtB:
2818: if (product->api_user) {
2819: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
2820: PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2821: PetscOptionsEnd();
2822: } else {
2823: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
2824: PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2825: PetscOptionsEnd();
2826: }
2827: break;
2828: case MATPRODUCT_PtAP:
2829: if (product->api_user) {
2830: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
2831: PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2832: PetscOptionsEnd();
2833: } else {
2834: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
2835: PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2836: PetscOptionsEnd();
2837: }
2838: break;
2839: case MATPRODUCT_RARt:
2840: if (product->api_user) {
2841: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
2842: PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2843: PetscOptionsEnd();
2844: } else {
2845: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
2846: PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2847: PetscOptionsEnd();
2848: }
2849: break;
2850: case MATPRODUCT_ABC:
2851: if (product->api_user) {
2852: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
2853: PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2854: PetscOptionsEnd();
2855: } else {
2856: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
2857: PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2858: PetscOptionsEnd();
2859: }
2860: break;
2861: default:
2862: break;
2863: }
2864: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2865: }
2866: /* dispatch */
2867: if (isdense) {
2868: switch (product->type) {
2869: case MATPRODUCT_AB:
2870: case MATPRODUCT_AtB:
2871: case MATPRODUCT_ABt:
2872: case MATPRODUCT_PtAP:
2873: case MATPRODUCT_RARt:
2874: if (product->A->boundtocpu) {
2875: MatProductSetFromOptions_SeqAIJ_SeqDense(mat);
2876: } else {
2877: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2878: }
2879: break;
2880: case MATPRODUCT_ABC:
2881: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2882: break;
2883: default:
2884: break;
2885: }
2886: } else if (Biscusp && Ciscusp) {
2887: switch (product->type) {
2888: case MATPRODUCT_AB:
2889: case MATPRODUCT_AtB:
2890: case MATPRODUCT_ABt:
2891: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2892: break;
2893: case MATPRODUCT_PtAP:
2894: case MATPRODUCT_RARt:
2895: case MATPRODUCT_ABC:
2896: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2897: break;
2898: default:
2899: break;
2900: }
2901: } else { /* fallback for AIJ */
2902: MatProductSetFromOptions_SeqAIJ(mat);
2903: }
2904: return 0;
2905: }
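/* Illustrative note (not part of this file): the CPU-backend options handled above are plain
   command-line switches, e.g.
     ./app -matmatmult_backend_cpu              (product created through the MatMatMult() API)
     ./app -mat_product_algorithm_backend_cpu   (product created through the MatProduct API)
   which make C = A*B fall back to the MATSEQAIJ host implementation. The program name ./app is a placeholder. */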
2907: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2908: {
2909: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2910: return 0;
2911: }
2913: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2914: {
2915: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2916: return 0;
2917: }
2919: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2920: {
2921: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2922: return 0;
2923: }
2925: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2926: {
2927: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2928: return 0;
2929: }
2931: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2932: {
2933: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2934: return 0;
2935: }
2937: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2938: {
2939: int i = blockIdx.x*blockDim.x + threadIdx.x;
2940: if (i < n) y[idx[i]] += x[i];
2941: }
2943: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2944: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2945: {
2946: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2947: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2948: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2949: PetscScalar *xarray,*zarray,*dptr,*beta,*xptr;
2950: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2951: PetscBool compressed;
2952: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2953: PetscInt nx,ny;
2954: #endif
2957: if (!a->nz) {
2958: if (!yy) VecSet_SeqCUDA(zz,0);
2959: else VecCopy_SeqCUDA(yy,zz);
2960: return 0;
2961: }
2962: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2963: MatSeqAIJCUSPARSECopyToGPU(A);
2964: if (!trans) {
2965: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2967: } else {
2968: if (herm || !A->form_explicit_transpose) {
2969: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2970: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2971: } else {
2972: if (!cusparsestruct->matTranspose) MatSeqAIJCUSPARSEFormExplicitTranspose(A);
2973: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2974: }
2975: }
2976: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2977: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2979: try {
2980: VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
2981: if (yy == zz) VecCUDAGetArray(zz,&zarray); /* read & write zz, so need to get uptodate zarray on GPU */
2982: else VecCUDAGetArrayWrite(zz,&zarray); /* write zz, so no need to init zarray on GPU */
2984: PetscLogGpuTimeBegin();
2985: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2986: /* z = A x + beta y.
2987: If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2988: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2989: */
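/* Example (illustrative): if A has 10 rows of which only 4 are nonzero, the compressed product A*x
   has length 4 and lands in workVector; it is scattered into the full-length zz further below by the
   ScatterAdd kernel. */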
2990: xptr = xarray;
2991: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2992: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2993: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2994: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2995: allocated to accommodate different uses. So we get the length info directly from mat.
2996: */
2997: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2998: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2999: nx = mat->num_cols;
3000: ny = mat->num_rows;
3001: }
3002: #endif
3003: } else {
3004: /* z = A^T x + beta y
3005: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3006: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3007: */
3008: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3009: dptr = zarray;
3010: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3011: if (compressed) { /* Scatter x to work vector */
3012: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3013: thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3014: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3015: VecCUDAEqualsReverse());
3016: }
3017: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3018: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3019: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3020: nx = mat->num_rows;
3021: ny = mat->num_cols;
3022: }
3023: #endif
3024: }
3026: /* csr_spmv does y = alpha op(A) x + beta y */
3027: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3028: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3030: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3031: cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);
3032: cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);
3033: PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
3034: matstruct->matDescr,
3035: matstruct->cuSpMV[opA].vecXDescr, beta,
3036: matstruct->cuSpMV[opA].vecYDescr,
3037: cusparse_scalartype,
3038: cusparsestruct->spmvAlg,
3039: &matstruct->cuSpMV[opA].spmvBufferSize));
3040: cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);
3042: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3043: } else {
3044: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3045: cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);
3046: cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);
3047: }
3049: PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
3050: matstruct->alpha_one,
3051: matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3052: matstruct->cuSpMV[opA].vecXDescr,
3053: beta,
3054: matstruct->cuSpMV[opA].vecYDescr,
3055: cusparse_scalartype,
3056: cusparsestruct->spmvAlg,
3057: matstruct->cuSpMV[opA].spmvBuffer));
3058: #else
3059: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3060: PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
3061: mat->num_rows, mat->num_cols,
3062: mat->num_entries, matstruct->alpha_one, matstruct->descr,
3063: mat->values->data().get(), mat->row_offsets->data().get(),
3064: mat->column_indices->data().get(), xptr, beta,
3065: dptr));
3066: #endif
3067: } else {
3068: if (cusparsestruct->nrows) {
3069: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3070: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3071: #else
3072: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3073: PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
3074: matstruct->alpha_one, matstruct->descr, hybMat,
3075: xptr, beta,
3076: dptr));
3077: #endif
3078: }
3079: }
3080: PetscLogGpuTimeEnd();
3082: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3083: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3084: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3085: VecCopy_SeqCUDA(yy,zz); /* zz = yy */
3086: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3087: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
3088: }
3089: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3090: VecSet_SeqCUDA(zz,0);
3091: }
3093: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3094: if (compressed) {
3095: PetscLogGpuTimeBegin();
3096: /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered),
3097: and in the destructor of its scope it calls cudaStreamSynchronize() on this stream. One would have to store all the events to
3098: prevent that. So I just added a ScatterAdd kernel instead.
3099: */
3100: #if 0
3101: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3102: thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3103: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3104: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3105: VecCUDAPlusEquals());
3106: #else
3107: PetscInt n = matstruct->cprowIndices->size();
3108: ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
3109: #endif
3110: PetscLogGpuTimeEnd();
3111: }
3112: } else {
3113: if (yy && yy != zz) {
3114: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
3115: }
3116: }
3117: VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
3118: if (yy == zz) VecCUDARestoreArray(zz,&zarray);
3119: else VecCUDARestoreArrayWrite(zz,&zarray);
3120: } catch(char *ex) {
3121: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3122: }
3123: if (yy) {
3124: PetscLogGpuFlops(2.0*a->nz);
3125: } else {
3126: PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
3127: }
3128: return 0;
3129: }
3131: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3132: {
3133: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
3134: return 0;
3135: }
3137: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
3138: {
3139: PetscObjectState onnz = A->nonzerostate;
3140: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3142: MatAssemblyEnd_SeqAIJ(A,mode);
3143: if (onnz != A->nonzerostate && cusp->deviceMat) {
3145: PetscInfo(A,"Destroy device mat since nonzerostate changed\n");
3146: cudaFree(cusp->deviceMat);
3147: cusp->deviceMat = NULL;
3148: }
3149: return 0;
3150: }
3152: /* --------------------------------------------------------------------------------*/
3153: /*@
3154: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3155: (the default sequential PETSc format). This matrix will ultimately be pushed down
3156: to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
3157: assembly performance the user should preallocate the matrix storage by setting
3158: the parameter nz (or the array nnz). By setting these parameters accurately,
3159: performance during matrix assembly can be increased by more than a factor of 50.
3161: Collective
3163: Input Parameters:
3164: + comm - MPI communicator, set to PETSC_COMM_SELF
3165: . m - number of rows
3166: . n - number of columns
3167: . nz - number of nonzeros per row (same for all rows)
3168: - nnz - array containing the number of nonzeros in the various rows
3169: (possibly different for each row) or NULL
3171: Output Parameter:
3172: . A - the matrix
3174: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3175: MatXXXXSetPreallocation() paradigm instead of this routine directly.
3176: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3178: Notes:
3179: If nnz is given then nz is ignored
3181: The AIJ format (also called the Yale sparse matrix format or
3182: compressed row storage) is fully compatible with standard Fortran 77
3183: storage. That is, the stored row and column indices can begin at
3184: either one (as in Fortran) or zero. See the users' manual for details.
3186: Specify the preallocated storage with either nz or nnz (not both).
3187: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3188: allocation. For large problems you MUST preallocate memory or you
3189: will get TERRIBLE performance, see the users' manual chapter on matrices.
3191: By default, this format uses inodes (identical nodes) when possible, to
3192: improve numerical efficiency of matrix-vector products and solves. We
3193: search for consecutive rows with the same nonzero structure, thereby
3194: reusing matrix information to achieve increased efficiency.
3196: Level: intermediate
3198: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3199: @*/
3200: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3201: {
3202: MatCreate(comm,A);
3203: MatSetSizes(*A,m,n,m,n);
3204: MatSetType(*A,MATSEQAIJCUSPARSE);
3205: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
3206: return 0;
3207: }
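/* A minimal usage sketch (illustrative, not part of this file) of the MatCreate()/MatSetType()/
   MatSeqAIJSetPreallocation() paradigm recommended above; the helper name and the preallocation of
   5 nonzeros per row are arbitrary example choices. */
#if 0
static PetscErrorCode BuildExampleAIJCUSPARSE(PetscInt m,PetscInt n,Mat *A)
{
  MatCreate(PETSC_COMM_SELF,A);         /* sequential communicator, as required by this matrix type */
  MatSetSizes(*A,m,n,m,n);
  MatSetType(*A,MATSEQAIJCUSPARSE);
  MatSeqAIJSetPreallocation(*A,5,NULL); /* the SeqAIJ preallocation call also covers the CUSPARSE subtype */
  MatSetFromOptions(*A);
  return 0;
}
#endif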
3209: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3210: {
3211: if (A->factortype == MAT_FACTOR_NONE) {
3212: MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
3213: } else {
3214: MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
3215: }
3216: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3217: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
3218: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);
3219: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3220: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3221: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3222: PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
3223: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3224: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3225: PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);
3226: MatDestroy_SeqAIJ(A);
3227: return 0;
3228: }
3230: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3231: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
3232: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
3233: {
3234: MatDuplicate_SeqAIJ(A,cpvalues,B);
3235: MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
3236: return 0;
3237: }
3239: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
3240: {
3241: Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3242: Mat_SeqAIJCUSPARSE *cy;
3243: Mat_SeqAIJCUSPARSE *cx;
3244: PetscScalar *ay;
3245: const PetscScalar *ax;
3246: CsrMatrix *csry,*csrx;
3248: cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3249: cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3250: if (X->ops->axpy != Y->ops->axpy) {
3251: MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3252: MatAXPY_SeqAIJ(Y,a,X,str);
3253: return 0;
3254: }
3255: /* if we are here, it means both matrices are bound to GPU */
3256: MatSeqAIJCUSPARSECopyToGPU(Y);
3257: MatSeqAIJCUSPARSECopyToGPU(X);
3260: csry = (CsrMatrix*)cy->mat->mat;
3261: csrx = (CsrMatrix*)cx->mat->mat;
3262: /* see if we can turn this into a cublas axpy */
3263: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3264: bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3265: if (eq) {
3266: eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3267: }
3268: if (eq) str = SAME_NONZERO_PATTERN;
3269: }
3270: /* spgeam is buggy with one column */
3271: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3273: if (str == SUBSET_NONZERO_PATTERN) {
3274: PetscScalar b = 1.0;
3275: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3276: size_t bufferSize;
3277: void *buffer;
3278: #endif
3280: MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3281: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3282: cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);
3283: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3284: PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3285: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3286: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3287: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
3288: cudaMalloc(&buffer,bufferSize);
3289: PetscLogGpuTimeBegin();
3290: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3291: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3292: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3293: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
3294: PetscLogGpuFlops(x->nz + y->nz);
3295: PetscLogGpuTimeEnd();
3296: cudaFree(buffer);
3297: #else
3298: PetscLogGpuTimeBegin();
3299: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3300: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3301: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3302: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
3303: PetscLogGpuFlops(x->nz + y->nz);
3304: PetscLogGpuTimeEnd();
3305: #endif
3306: cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);
3307: MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3308: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3309: MatSeqAIJInvalidateDiagonal(Y);
3310: } else if (str == SAME_NONZERO_PATTERN) {
3311: cublasHandle_t cublasv2handle;
3312: PetscBLASInt one = 1, bnz = 1;
3314: MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3315: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3316: PetscCUBLASGetHandle(&cublasv2handle);
3317: PetscBLASIntCast(x->nz,&bnz);
3318: PetscLogGpuTimeBegin();
3319: cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);
3320: PetscLogGpuFlops(2.0*bnz);
3321: PetscLogGpuTimeEnd();
3322: MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3323: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3324: MatSeqAIJInvalidateDiagonal(Y);
3325: } else {
3326: MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3327: MatAXPY_SeqAIJ(Y,a,X,str);
3328: }
3329: return 0;
3330: }
3332: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3333: {
3334: Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data;
3335: PetscScalar *ay;
3336: cublasHandle_t cublasv2handle;
3337: PetscBLASInt one = 1, bnz = 1;
3339: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3340: PetscCUBLASGetHandle(&cublasv2handle);
3341: PetscBLASIntCast(y->nz,&bnz);
3342: PetscLogGpuTimeBegin();
3343: cublasXscal(cublasv2handle,bnz,&a,ay,one);
3344: PetscLogGpuFlops(bnz);
3345: PetscLogGpuTimeEnd();
3346: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3347: MatSeqAIJInvalidateDiagonal(Y);
3348: return 0;
3349: }
3351: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3352: {
3353: PetscBool both = PETSC_FALSE;
3354: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3356: if (A->factortype == MAT_FACTOR_NONE) {
3357: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
3358: if (spptr->mat) {
3359: CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
3360: if (matrix->values) {
3361: both = PETSC_TRUE;
3362: thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3363: }
3364: }
3365: if (spptr->matTranspose) {
3366: CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
3367: if (matrix->values) {
3368: thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3369: }
3370: }
3371: }
3372: PetscArrayzero(a->a,a->i[A->rmap->n]);
3373: MatSeqAIJInvalidateDiagonal(A);
3374: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3375: else A->offloadmask = PETSC_OFFLOAD_CPU;
3376: return 0;
3377: }
3379: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3380: {
3381: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3383: if (A->factortype != MAT_FACTOR_NONE) {
3384: A->boundtocpu = flg;
3385: return 0;
3386: }
3387: if (flg) {
3388: MatSeqAIJCUSPARSECopyFromGPU(A);
3390: A->ops->scale = MatScale_SeqAIJ;
3391: A->ops->axpy = MatAXPY_SeqAIJ;
3392: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
3393: A->ops->mult = MatMult_SeqAIJ;
3394: A->ops->multadd = MatMultAdd_SeqAIJ;
3395: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
3396: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
3397: A->ops->multhermitiantranspose = NULL;
3398: A->ops->multhermitiantransposeadd = NULL;
3399: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
3400: PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));
3401: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3402: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3403: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3404: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3405: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3406: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);
3407: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3408: } else {
3409: A->ops->scale = MatScale_SeqAIJCUSPARSE;
3410: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
3411: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
3412: A->ops->mult = MatMult_SeqAIJCUSPARSE;
3413: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
3414: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
3415: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
3416: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3417: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3418: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
3419: a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE;
3420: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
3421: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
3422: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
3423: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
3424: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
3425: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
3427: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);
3428: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3429: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3430: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);
3431: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);
3432: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3433: }
3434: A->boundtocpu = flg;
3435: if (flg && a->inode.size) {
3436: a->inode.use = PETSC_TRUE;
3437: } else {
3438: a->inode.use = PETSC_FALSE;
3439: }
3440: return 0;
3441: }
3443: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3444: {
3445: Mat B;
3447: PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* first use of CUSPARSE may be via MatConvert */
3448: if (reuse == MAT_INITIAL_MATRIX) {
3449: MatDuplicate(A,MAT_COPY_VALUES,newmat);
3450: } else if (reuse == MAT_REUSE_MATRIX) {
3451: MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
3452: }
3453: B = *newmat;
3455: PetscFree(B->defaultvectype);
3456: PetscStrallocpy(VECCUDA,&B->defaultvectype);
3458: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3459: if (B->factortype == MAT_FACTOR_NONE) {
3460: Mat_SeqAIJCUSPARSE *spptr;
3461: PetscNew(&spptr);
3462: cusparseCreate(&spptr->handle);
3463: cusparseSetStream(spptr->handle,PetscDefaultCudaStream);
3464: spptr->format = MAT_CUSPARSE_CSR;
3465: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3466: #if CUSPARSE_VERSION > 11301
3467: spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3468: #else
3469: spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3470: #endif
3471: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3472: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3473: #endif
3474: B->spptr = spptr;
3475: } else {
3476: Mat_SeqAIJCUSPARSETriFactors *spptr;
3478: PetscNew(&spptr);
3479: cusparseCreate(&spptr->handle);
3480: cusparseSetStream(spptr->handle,PetscDefaultCudaStream);
3481: B->spptr = spptr;
3482: }
3483: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3484: }
3485: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
3486: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
3487: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
3488: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3489: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
3490: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
3492: MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
3493: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
3494: PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
3495: #if defined(PETSC_HAVE_HYPRE)
3496: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);
3497: #endif
3498: PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);
3499: return 0;
3500: }
3502: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3503: {
3504: MatCreate_SeqAIJ(B);
3505: MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
3506: return 0;
3507: }
3509: /*MC
3510: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3512: A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3513: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3514: All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3516: Options Database Keys:
3517: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3518: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3519: . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3520: - -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3522: Level: beginner
3524: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3525: M*/
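/* Illustrative command-line example (assumption, not part of this file): an existing PETSc application
   can switch to this matrix type at runtime with the Options Database Keys documented above, e.g.
     ./app -mat_type aijcusparse -vec_type cuda -mat_cusparse_mult_storage_format csr
   where -vec_type cuda is the usual companion option for GPU vectors and ./app is a placeholder name. */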
3527: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3529: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3530: {
3531: MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);
3532: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
3533: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
3534: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
3535: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
3537: return 0;
3538: }
3540: static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
3541: {
3542: Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;
3544: if (!cusp) return 0;
3545: delete cusp->cooPerm;
3546: delete cusp->cooPerm_a;
3547: cusp->cooPerm = NULL;
3548: cusp->cooPerm_a = NULL;
3549: if (cusp->use_extended_coo) {
3550: cudaFree(cusp->jmap_d);
3551: cudaFree(cusp->perm_d);
3552: }
3553: cusp->use_extended_coo = PETSC_FALSE;
3554: return 0;
3555: }
3557: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3558: {
3559: if (*cusparsestruct) {
3560: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
3561: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
3562: delete (*cusparsestruct)->workVector;
3563: delete (*cusparsestruct)->rowoffsets_gpu;
3564: delete (*cusparsestruct)->cooPerm;
3565: delete (*cusparsestruct)->cooPerm_a;
3566: delete (*cusparsestruct)->csr2csc_i;
3567: if ((*cusparsestruct)->handle) cusparseDestroy((*cusparsestruct)->handle);
3568: if ((*cusparsestruct)->jmap_d) cudaFree((*cusparsestruct)->jmap_d);
3569: if ((*cusparsestruct)->perm_d) cudaFree((*cusparsestruct)->perm_d);
3570: PetscFree(*cusparsestruct);
3571: }
3572: return 0;
3573: }
3575: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3576: {
3577: if (*mat) {
3578: delete (*mat)->values;
3579: delete (*mat)->column_indices;
3580: delete (*mat)->row_offsets;
3581: delete *mat;
3582: *mat = 0;
3583: }
3584: return 0;
3585: }
3587: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3588: {
3589: if (*trifactor) {
3590: if ((*trifactor)->descr) cusparseDestroyMatDescr((*trifactor)->descr);
3591: if ((*trifactor)->solveInfo) cusparse_destroy_analysis_info((*trifactor)->solveInfo);
3592: CsrMatrix_Destroy(&(*trifactor)->csrMat);
3593: if ((*trifactor)->solveBuffer) cudaFree((*trifactor)->solveBuffer);
3594: if ((*trifactor)->AA_h) cudaFreeHost((*trifactor)->AA_h);
3595: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3596: if ((*trifactor)->csr2cscBuffer) cudaFree((*trifactor)->csr2cscBuffer);
3597: #endif
3598: PetscFree(*trifactor);
3599: }
3600: return 0;
3601: }
3603: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
3604: {
3605: CsrMatrix *mat;
3607: if (*matstruct) {
3608: if ((*matstruct)->mat) {
3609: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3610: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3611: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3612: #else
3613: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3614: cusparseDestroyHybMat(hybMat);
3615: #endif
3616: } else {
3617: mat = (CsrMatrix*)(*matstruct)->mat;
3618: CsrMatrix_Destroy(&mat);
3619: }
3620: }
3621: if ((*matstruct)->descr) cusparseDestroyMatDescr((*matstruct)->descr);
3622: delete (*matstruct)->cprowIndices;
3623: if ((*matstruct)->alpha_one) cudaFree((*matstruct)->alpha_one);
3624: if ((*matstruct)->beta_zero) cudaFree((*matstruct)->beta_zero);
3625: if ((*matstruct)->beta_one) cudaFree((*matstruct)->beta_one);
3627: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3628: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3629: if (mdata->matDescr) cusparseDestroySpMat(mdata->matDescr);
3630: for (int i=0; i<3; i++) {
3631: if (mdata->cuSpMV[i].initialized) {
3632: cudaFree(mdata->cuSpMV[i].spmvBuffer);
3633: cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);
3634: cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);
3635: }
3636: }
3637: #endif
3638: delete *matstruct;
3639: *matstruct = NULL;
3640: }
3641: return 0;
3642: }
3644: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
3645: {
3646: if (*trifactors) {
3647: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
3648: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
3649: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
3650: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
3651: delete (*trifactors)->rpermIndices;
3652: delete (*trifactors)->cpermIndices;
3653: delete (*trifactors)->workVector;
3654: (*trifactors)->rpermIndices = NULL;
3655: (*trifactors)->cpermIndices = NULL;
3656: (*trifactors)->workVector = NULL;
3657: if ((*trifactors)->a_band_d) cudaFree((*trifactors)->a_band_d);
3658: if ((*trifactors)->i_band_d) cudaFree((*trifactors)->i_band_d);
3659: (*trifactors)->init_dev_prop = PETSC_FALSE;
3660: }
3661: return 0;
3662: }
3664: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3665: {
3666: cusparseHandle_t handle;
3668: if (*trifactors) {
3669: MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
3670: if (handle = (*trifactors)->handle) {
3671: cusparseDestroy(handle);
3672: }
3673: PetscFree(*trifactors);
3674: }
3675: return 0;
3676: }
3678: struct IJCompare
3679: {
3680: __host__ __device__
3681: inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3682: {
3683: if (t1.get<0>() < t2.get<0>()) return true;
3684: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3685: return false;
3686: }
3687: };
3689: struct IJEqual
3690: {
3691: __host__ __device__
3692: inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3693: {
3694: if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3695: return true;
3696: }
3697: };
3699: struct IJDiff
3700: {
3701: __host__ __device__
3702: inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3703: {
3704: return t1 == t2 ? 0 : 1;
3705: }
3706: };
3708: struct IJSum
3709: {
3710: __host__ __device__
3711: inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3712: {
3713: return t1||t2;
3714: }
3715: };
3717: #include <thrust/iterator/discard_iterator.h>
3718: /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
3719: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
3720: {
3721: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3722: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3723: THRUSTARRAY *cooPerm_v = NULL;
3724: thrust::device_ptr<const PetscScalar> d_v;
3725: CsrMatrix *matrix;
3726: PetscInt n;
3730: if (!cusp->cooPerm) {
3731: MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
3732: MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
3733: return 0;
3734: }
3735: matrix = (CsrMatrix*)cusp->mat->mat;
3737: if (!v) {
3738: if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3739: goto finalize;
3740: }
3741: n = cusp->cooPerm->size();
3742: if (isCudaMem(v)) {
3743: d_v = thrust::device_pointer_cast(v);
3744: } else {
3745: cooPerm_v = new THRUSTARRAY(n);
3746: cooPerm_v->assign(v,v+n);
3747: d_v = cooPerm_v->data();
3748: PetscLogCpuToGpu(n*sizeof(PetscScalar));
3749: }
3750: PetscLogGpuTimeBegin();
3751: if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3752: if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them up first */
3753: THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3754: auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3755: /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
3756: cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
3757: cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
3758: */
3759: thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3760: thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3761: delete cooPerm_w;
3762: } else {
3763: /* all nonzeros in d_v[] are unique entries */
3764: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3765: matrix->values->begin()));
3766: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3767: matrix->values->end()));
3768: thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
3769: }
3770: } else {
3771: if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3772: auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3773: thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3774: } else {
3775: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3776: matrix->values->begin()));
3777: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3778: matrix->values->end()));
3779: thrust::for_each(zibit,zieit,VecCUDAEquals());
3780: }
3781: }
3782: PetscLogGpuTimeEnd();
3783: finalize:
3784: delete cooPerm_v;
3785: A->offloadmask = PETSC_OFFLOAD_GPU;
3786: PetscObjectStateIncrease((PetscObject)A);
3787: /* shorter version of MatAssemblyEnd_SeqAIJ */
3788: PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz);
3789: PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");
3790: PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax);
3791: a->reallocs = 0;
3792: A->info.mallocs += 0;
3793: A->info.nz_unneeded = 0;
3794: A->assembled = A->was_assembled = PETSC_TRUE;
3795: A->num_ass++;
3796: return 0;
3797: }
3799: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3800: {
3801: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3804: if (!cusp) return 0;
3805: if (destroy) {
3806: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
3807: delete cusp->csr2csc_i;
3808: cusp->csr2csc_i = NULL;
3809: }
3810: A->transupdated = PETSC_FALSE;
3811: return 0;
3812: }
3814: #include <thrust/binary_search.h>
3815: /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
3816: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
3817: {
3818: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3819: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3820: PetscInt cooPerm_n, nzr = 0;
3822: PetscLayoutSetUp(A->rmap);
3823: PetscLayoutSetUp(A->cmap);
3824: cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3825: if (n != cooPerm_n) {
3826: delete cusp->cooPerm;
3827: delete cusp->cooPerm_a;
3828: cusp->cooPerm = NULL;
3829: cusp->cooPerm_a = NULL;
3830: }
3831: if (n) {
3832: THRUSTINTARRAY d_i(n);
3833: THRUSTINTARRAY d_j(n);
3834: THRUSTINTARRAY ii(A->rmap->n);
3836: if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); }
3837: if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
3839: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
3840: d_i.assign(coo_i,coo_i+n);
3841: d_j.assign(coo_j,coo_j+n);
3843: /* Ex.
3844: n = 6
3845: coo_i = [3,3,1,4,1,4]
3846: coo_j = [3,2,2,5,2,6]
3847: */
3848: auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3849: auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
3851: PetscLogGpuTimeBegin();
3852: thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3853: thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
3854: *cusp->cooPerm_a = d_i; /* copy the sorted array */
3855: THRUSTINTARRAY w = d_j;
3857: /*
3858: d_i = [1,1,3,3,4,4]
3859: d_j = [2,2,2,3,5,6]
3860: cooPerm = [2,4,1,0,3,5]
3861: */
3862: auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
3864: /*
3865: d_i = [1,3,3,4,4,x]
3866: ^ekey
3867: d_j = [2,2,3,5,6,x]
3868: ^nekey
3869: */
3870: if (nekey == ekey) { /* all entries are unique */
3871: delete cusp->cooPerm_a;
3872: cusp->cooPerm_a = NULL;
3873: } else { /* Stefano: I couldn't come up with a more elegant algorithm */
3874: /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
3875: adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
3876: adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
3877: (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
3878: w[0] = 0;
3879: thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/
3880: thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
3881: }
3882: thrust::counting_iterator<PetscInt> search_begin(0);
3883: thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search the values 0,1,...,A->rmap->n-1 in the ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
3884: search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
3885: ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
3886: PetscLogGpuTimeEnd();
3888: MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);
3889: a->singlemalloc = PETSC_FALSE;
3890: a->free_a = PETSC_TRUE;
3891: a->free_ij = PETSC_TRUE;
3892: PetscMalloc1(A->rmap->n+1,&a->i);
3893: a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
3894: cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);
3895: a->nz = a->maxnz = a->i[A->rmap->n];
3896: a->rmax = 0;
3897: PetscMalloc1(a->nz,&a->a);
3898: PetscMalloc1(a->nz,&a->j);
3899: cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);
3900: if (!a->ilen) PetscMalloc1(A->rmap->n,&a->ilen);
3901: if (!a->imax) PetscMalloc1(A->rmap->n,&a->imax);
3902: for (PetscInt i = 0; i < A->rmap->n; i++) {
3903: const PetscInt nnzr = a->i[i+1] - a->i[i];
3904: nzr += (PetscInt)!!(nnzr);
3905: a->ilen[i] = a->imax[i] = nnzr;
3906: a->rmax = PetscMax(a->rmax,nnzr);
3907: }
3908: a->nonzerorowcnt = nzr;
3909: A->preallocated = PETSC_TRUE;
3910: PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));
3911: MatMarkDiagonal_SeqAIJ(A);
3912: } else {
3913: MatSeqAIJSetPreallocation(A,0,NULL);
3914: }
3915: MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);
3917: /* We want to allocate the CUSPARSE struct for matvec now.
3918: The code is so convoluted now that I prefer to copy zeros */
3919: PetscArrayzero(a->a,a->nz);
3920: MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);
3921: A->offloadmask = PETSC_OFFLOAD_CPU;
3922: MatSeqAIJCUSPARSECopyToGPU(A);
3923: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
3924: return 0;
3925: }
3927: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
3928: {
3929: Mat_SeqAIJ *seq;
3930: Mat_SeqAIJCUSPARSE *dev;
3931: PetscBool coo_basic = PETSC_TRUE;
3932: PetscMemType mtype = PETSC_MEMTYPE_DEVICE;
3934: MatResetPreallocationCOO_SeqAIJ(mat);
3935: MatResetPreallocationCOO_SeqAIJCUSPARSE(mat);
3936: if (coo_i) {
3937: PetscGetMemType(coo_i,&mtype);
3938: if (PetscMemTypeHost(mtype)) {
3939: for (PetscCount k=0; k<coo_n; k++) {
3940: if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
3941: }
3942: }
3943: }
3945: if (coo_basic) { /* i,j are on device or do not contain negative indices */
3946: MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);
3947: } else {
3948: MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j);
3949: mat->offloadmask = PETSC_OFFLOAD_CPU;
3950: MatSeqAIJCUSPARSECopyToGPU(mat);
3951: seq = static_cast<Mat_SeqAIJ*>(mat->data);
3952: dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
3953: cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount));
3954: cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);
3955: cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount));
3956: cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice);
3957: dev->use_extended_coo = PETSC_TRUE;
3958: }
3959: return 0;
3960: }
3962: __global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
3963: {
3964: PetscCount i = blockIdx.x*blockDim.x + threadIdx.x;
3965: const PetscCount grid_size = gridDim.x * blockDim.x;
3966: for (; i<nnz; i+= grid_size) {
3967: PetscScalar sum = 0.0;
3968: for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
3969: a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
3970: }
3971: }
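/* Worked example for the kernel above (illustrative data): suppose nnz = 3 assembled nonzeros are fed by
   5 COO entries, where COO entries 0 and 2 map to nonzero 0, entry 1 maps to nonzero 1, and entries 3 and 4
   map to nonzero 2. Then jmap = [0,2,3,5] and perm = [0,2,1,3,4], so the kernel computes
   a[0] = kv[0]+kv[2], a[1] = kv[1], a[2] = kv[3]+kv[4] (added to the old a[i] unless imode == INSERT_VALUES). */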
3973: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3974: {
3975: Mat_SeqAIJ *seq = (Mat_SeqAIJ*)A->data;
3976: Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
3977: PetscCount Annz = seq->nz;
3978: PetscMemType memtype;
3979: const PetscScalar *v1 = v;
3980: PetscScalar *Aa;
3982: if (dev->use_extended_coo) {
3983: PetscGetMemType(v,&memtype);
3984: if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, copy it to the device first */
3985: cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar));
3986: cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);
3987: }
3989: if (imode == INSERT_VALUES) MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);
3990: else MatSeqAIJCUSPARSEGetArray(A,&Aa);
3992: if (Annz) {
3993: MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
3994: cudaPeekAtLastError();
3995: }
3997: if (imode == INSERT_VALUES) MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);
3998: else MatSeqAIJCUSPARSERestoreArray(A,&Aa);
4000: if (PetscMemTypeHost(memtype)) cudaFree((void*)v1);
4001: } else {
4002: MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode);
4003: }
4004: return 0;
4005: }
4007: /*@C
4008: MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4010: Not collective
4012: Input Parameters:
4013: + A - the matrix
4014: -  compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4016: Output Parameters:
4017: + i - the CSR row pointers
4018: - j - the CSR column indices
4020: Level: developer
4022: Notes:
4023: When compressed is true, the CSR structure does not contain empty rows
4025: .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4026: @*/
4027: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
4028: {
4029: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4030: CsrMatrix *csr;
4031: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4034: if (!i || !j) return 0;
4037: MatSeqAIJCUSPARSECopyToGPU(A);
4039: csr = (CsrMatrix*)cusp->mat->mat;
4040: if (i) {
4041: if (!compressed && a->compressedrow.use) { /* need full row offset */
4042: if (!cusp->rowoffsets_gpu) {
4043: cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4044: cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4045: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
4046: }
4047: *i = cusp->rowoffsets_gpu->data().get();
4048: } else *i = csr->row_offsets->data().get();
4049: }
4050: if (j) *j = csr->column_indices->data().get();
4051: return 0;
4052: }
4054: /*@C
4055: MatSeqAIJCUSPARSERestoreIJ - restore the device CSR row offsets and column indices obtained with MatSeqAIJCUSPARSEGetIJ()
4057: Not collective
4059: Input Parameters:
4060: + A - the matrix
4061: - compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure was returned in compressed form
4063: Output Parameters:
4064: + i - the CSR row pointers
4065: - j - the CSR column indices
4067: Level: developer
4069: .seealso: MatSeqAIJCUSPARSEGetIJ()
4070: @*/
4071: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
4072: {
4075: if (i) *i = NULL;
4076: if (j) *j = NULL;
4077: return 0;
4078: }
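/* Example usage (sketch): obtaining the device CSR structure, e.g. to hand it to a custom kernel;
   the pointers are only borrowed and must be restored, not freed.

     const int *d_i,*d_j;
     MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&d_i,&d_j);   // full row offsets: A->rmap->n+1 entries in d_i
     // ... use d_i/d_j in device code ...
     MatSeqAIJCUSPARSERestoreIJ(A,PETSC_FALSE,&d_i,&d_j);
*/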
4080: /*@C
4081: MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4083: Not Collective
4085: Input Parameter:
4086: . A - a MATSEQAIJCUSPARSE matrix
4088: Output Parameter:
4089: . a - pointer to the device data
4091: Level: developer
4093: Notes: May trigger a host-to-device copy if the up-to-date matrix data is on the host
4095: .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4096: @*/
4097: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
4098: {
4099: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4100: CsrMatrix *csr;
4106: MatSeqAIJCUSPARSECopyToGPU(A);
4108: csr = (CsrMatrix*)cusp->mat->mat;
4110: *a = csr->values->data().get();
4111: return 0;
4112: }
4114: /*@C
4115: MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4117: Not Collective
4119: Input Parameter:
4120: . A - a MATSEQAIJCUSPARSE matrix
4122: Output Parameter:
4123: . a - pointer to the device data
4125: Level: developer
4127: .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4128: @*/
4129: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
4130: {
4134: *a = NULL;
4135: return 0;
4136: }
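/* Example usage (sketch): read-only access to the nonzero values on the device.

     const PetscScalar *d_a;
     MatSeqAIJCUSPARSEGetArrayRead(A,&d_a);     // may copy the host data to the device first
     // ... pass d_a to a kernel or cuSPARSE call that only reads it ...
     MatSeqAIJCUSPARSERestoreArrayRead(A,&d_a);
*/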
4138: /*@C
4139: MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4141: Not Collective
4143: Input Parameter:
4144: . A - a MATSEQAIJCUSPARSE matrix
4146: Output Parameter:
4147: . a - pointer to the device data
4149: Level: developer
4151: Notes: May trigger a host-to-device copy if the up-to-date matrix data is on the host
4153: .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4154: @*/
4155: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
4156: {
4157: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4158: CsrMatrix *csr;
4164: MatSeqAIJCUSPARSECopyToGPU(A);
4166: csr = (CsrMatrix*)cusp->mat->mat;
4168: *a = csr->values->data().get();
4169: A->offloadmask = PETSC_OFFLOAD_GPU;
4170: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
4171: return 0;
4172: }
4173: /*@C
4174: MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4176: Not Collective
4178: Input Parameter:
4179: . A - a MATSEQAIJCUSPARSE matrix
4181: Output Parameter:
4182: . a - pointer to the device data
4184: Level: developer
4186: .seealso: MatSeqAIJCUSPARSEGetArray()
4187: @*/
4188: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
4189: {
4193: MatSeqAIJInvalidateDiagonal(A);
4194: PetscObjectStateIncrease((PetscObject)A);
4195: *a = NULL;
4196: return 0;
4197: }
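/* Example usage (sketch): modifying the nonzero values in place on the device; the restore call
   invalidates the cached diagonal and increases the object state so PETSc sees the change.

     PetscScalar *d_a;
     MatSeqAIJCUSPARSEGetArray(A,&d_a);     // may copy the host data to the device first
     // ... update the values behind d_a (custom kernel, Thrust, cuBLAS, ...) ...
     MatSeqAIJCUSPARSERestoreArray(A,&d_a);
*/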
4199: /*@C
4200: MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4202: Not Collective
4204: Input Parameter:
4205: . A - a MATSEQAIJCUSPARSE matrix
4207: Output Parameter:
4208: . a - pointer to the device data
4210: Level: developer
4212: Notes: Does not trigger a host-to-device copy, since the current values are not needed; the device copy of the data is marked as the valid one
4214: .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
4215: @*/
4216: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
4217: {
4218: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4219: CsrMatrix *csr;
4226: csr = (CsrMatrix*)cusp->mat->mat;
4228: *a = csr->values->data().get();
4229: A->offloadmask = PETSC_OFFLOAD_GPU;
4230: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
4231: return 0;
4232: }
4234: /*@C
4235: MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4237: Not Collective
4239: Input Parameter:
4240: . A - a MATSEQAIJCUSPARSE matrix
4242: Output Parameter:
4243: . a - pointer to the device data
4245: Level: developer
4247: .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
4248: @*/
4249: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
4250: {
4254: MatSeqAIJInvalidateDiagonal(A);
4255: PetscObjectStateIncrease((PetscObject)A);
4256: *a = NULL;
4257: return 0;
4258: }
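/* Example usage (sketch): overwriting every stored value without reading the old ones, so no
   host-to-device copy is needed; nnz below stands for the number of stored nonzeros and is
   illustrative.

     PetscScalar *d_a;
     MatSeqAIJCUSPARSEGetArrayWrite(A,&d_a);
     cudaMemset(d_a,0,nnz*sizeof(PetscScalar));   // all-zero bits is 0.0 for PetscScalar
     MatSeqAIJCUSPARSERestoreArrayWrite(A,&d_a);
*/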
4260: struct IJCompare4
4261: {
4262: __host__ __device__
4263: inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4264: {
4265: if (t1.get<0>() < t2.get<0>()) return true;
4266: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4267: return false;
4268: }
4269: };
4271: struct Shift
4272: {
4273: int _shift;
4275: Shift(int shift) : _shift(shift) {}
4276: __host__ __device__
4277: inline int operator() (const int &c)
4278: {
4279: return c + _shift;
4280: }
4281: };
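/* IJCompare4 orders (row,col,value,flag) tuples lexicographically by (row,col), and Shift adds a
   fixed offset to an integer index; both are used by the row-wise merge below. For instance,

     thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n))

   yields B's column indices renumbered to follow A's columns without materializing a shifted copy. */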
4283: /* merges two SeqAIJCUSPARSE matrices A and B by concatenating them side by side, C = [A B] (the [A';B']' operation in MATLAB notation) */
4284: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4285: {
4286: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4287: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4288: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4289: CsrMatrix *Acsr,*Bcsr,*Ccsr;
4290: PetscInt Annz,Bnnz;
4291: cusparseStatus_t stat;
4292: PetscInt i,m,n,zero = 0;
4303: if (reuse == MAT_INITIAL_MATRIX) {
4304: m = A->rmap->n;
4305: n = A->cmap->n + B->cmap->n;
4306: MatCreate(PETSC_COMM_SELF,C);
4307: MatSetSizes(*C,m,n,m,n);
4308: MatSetType(*C,MATSEQAIJCUSPARSE);
4309: c = (Mat_SeqAIJ*)(*C)->data;
4310: Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4311: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
4312: Ccsr = new CsrMatrix;
4313: Cmat->cprowIndices = NULL;
4314: c->compressedrow.use = PETSC_FALSE;
4315: c->compressedrow.nrows = 0;
4316: c->compressedrow.i = NULL;
4317: c->compressedrow.rindex = NULL;
4318: Ccusp->workVector = NULL;
4319: Ccusp->nrows = m;
4320: Ccusp->mat = Cmat;
4321: Ccusp->mat->mat = Ccsr;
4322: Ccsr->num_rows = m;
4323: Ccsr->num_cols = n;
4324: cusparseCreateMatDescr(&Cmat->descr);
4325: cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);
4326: cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
4327: cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));
4328: cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));
4329: cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));
4330: cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
4331: cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);
4332: cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
4333: MatSeqAIJCUSPARSECopyToGPU(A);
4334: MatSeqAIJCUSPARSECopyToGPU(B);
4338: Acsr = (CsrMatrix*)Acusp->mat->mat;
4339: Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4340: Annz = (PetscInt)Acsr->column_indices->size();
4341: Bnnz = (PetscInt)Bcsr->column_indices->size();
4342: c->nz = Annz + Bnnz;
4343: Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4344: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4345: Ccsr->values = new THRUSTARRAY(c->nz);
4346: Ccsr->num_entries = c->nz;
4347: Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4348: if (c->nz) {
4349: auto Acoo = new THRUSTINTARRAY32(Annz);
4350: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4351: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4352: THRUSTINTARRAY32 *Aroff,*Broff;
4354: if (a->compressedrow.use) { /* need full row offset */
4355: if (!Acusp->rowoffsets_gpu) {
4356: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4357: Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4358: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
4359: }
4360: Aroff = Acusp->rowoffsets_gpu;
4361: } else Aroff = Acsr->row_offsets;
4362: if (b->compressedrow.use) { /* need full row offset */
4363: if (!Bcusp->rowoffsets_gpu) {
4364: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4365: Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4366: PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
4367: }
4368: Broff = Bcusp->rowoffsets_gpu;
4369: } else Broff = Bcsr->row_offsets;
4370: PetscLogGpuTimeBegin();
4371: stat = cusparseXcsr2coo(Acusp->handle,
4372: Aroff->data().get(),
4373: Annz,
4374: m,
4375: Acoo->data().get(),
4376: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4377: stat = cusparseXcsr2coo(Bcusp->handle,
4378: Broff->data().get(),
4379: Bnnz,
4380: m,
4381: Bcoo->data().get(),
4382: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4383: /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4384: auto Aperm = thrust::make_constant_iterator(1);
4385: auto Bperm = thrust::make_constant_iterator(0);
4386: #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4387: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4388: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4389: #else
4390: /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4391: auto Bcib = Bcsr->column_indices->begin();
4392: auto Bcie = Bcsr->column_indices->end();
4393: thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4394: #endif
4395: auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4396: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4397: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4398: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4399: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4400: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4401: auto p1 = Ccusp->cooPerm->begin();
4402: auto p2 = Ccusp->cooPerm->begin();
4403: thrust::advance(p2,Annz);
4404: PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4405: #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4406: thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4407: #endif
4408: auto cci = thrust::make_counting_iterator(zero);
4409: auto cce = thrust::make_counting_iterator(c->nz);
4410: #if 0 //Errors on SUMMIT cuda 11.1.0
4411: PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4412: #else
4413: auto pred = thrust::identity<int>();
4414: PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4415: PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4416: #endif
4417: stat = cusparseXcoo2csr(Ccusp->handle,
4418: Ccoo->data().get(),
4419: c->nz,
4420: m,
4421: Ccsr->row_offsets->data().get(),
4422: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4423: PetscLogGpuTimeEnd();
4424: delete wPerm;
4425: delete Acoo;
4426: delete Bcoo;
4427: delete Ccoo;
4428: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4429: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4430: Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4431: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4432: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4433: #endif
4434: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B both form explicit transposes, generate the transpose of C too */
4435: MatSeqAIJCUSPARSEFormExplicitTranspose(A);
4436: MatSeqAIJCUSPARSEFormExplicitTranspose(B);
4437: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4438: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4439: CsrMatrix *CcsrT = new CsrMatrix;
4440: CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4441: CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4443: (*C)->form_explicit_transpose = PETSC_TRUE;
4444: (*C)->transupdated = PETSC_TRUE;
4445: Ccusp->rowoffsets_gpu = NULL;
4446: CmatT->cprowIndices = NULL;
4447: CmatT->mat = CcsrT;
4448: CcsrT->num_rows = n;
4449: CcsrT->num_cols = m;
4450: CcsrT->num_entries = c->nz;
4452: CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4453: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4454: CcsrT->values = new THRUSTARRAY(c->nz);
4456: PetscLogGpuTimeBegin();
4457: auto rT = CcsrT->row_offsets->begin();
4458: if (AT) {
4459: rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4460: thrust::advance(rT,-1);
4461: }
4462: if (BT) {
4463: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4464: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4465: thrust::copy(titb,tite,rT);
4466: }
4467: auto cT = CcsrT->column_indices->begin();
4468: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4469: if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4470: auto vT = CcsrT->values->begin();
4471: if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4472: if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4473: PetscLogGpuTimeEnd();
4475: cusparseCreateMatDescr(&CmatT->descr);
4476: cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);
4477: cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
4478: cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));
4479: cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));
4480: cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));
4481: cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
4482: cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);
4483: cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);
4484: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4485: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4486: CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4487: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4488: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4489: #endif
4490: Ccusp->matTranspose = CmatT;
4491: }
4492: }
4494: c->singlemalloc = PETSC_FALSE;
4495: c->free_a = PETSC_TRUE;
4496: c->free_ij = PETSC_TRUE;
4497: PetscMalloc1(m+1,&c->i);
4498: PetscMalloc1(c->nz,&c->j);
4499: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4500: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4501: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4502: ii = *Ccsr->row_offsets;
4503: jj = *Ccsr->column_indices;
4504: cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
4505: cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
4506: } else {
4507: cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
4508: cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);
4509: }
4510: PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
4511: PetscMalloc1(m,&c->ilen);
4512: PetscMalloc1(m,&c->imax);
4513: c->maxnz = c->nz;
4514: c->nonzerorowcnt = 0;
4515: c->rmax = 0;
4516: for (i = 0; i < m; i++) {
4517: const PetscInt nn = c->i[i+1] - c->i[i];
4518: c->ilen[i] = c->imax[i] = nn;
4519: c->nonzerorowcnt += (PetscInt)!!nn;
4520: c->rmax = PetscMax(c->rmax,nn);
4521: }
4522: MatMarkDiagonal_SeqAIJ(*C);
4523: PetscMalloc1(c->nz,&c->a);
4524: (*C)->nonzerostate++;
4525: PetscLayoutSetUp((*C)->rmap);
4526: PetscLayoutSetUp((*C)->cmap);
4527: Ccusp->nonzerostate = (*C)->nonzerostate;
4528: (*C)->preallocated = PETSC_TRUE;
4529: } else {
4531: c = (Mat_SeqAIJ*)(*C)->data;
4532: if (c->nz) {
4533: Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4537: MatSeqAIJCUSPARSECopyToGPU(A);
4538: MatSeqAIJCUSPARSECopyToGPU(B);
4541: Acsr = (CsrMatrix*)Acusp->mat->mat;
4542: Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4543: Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4549: auto pmid = Ccusp->cooPerm->begin();
4550: thrust::advance(pmid,Acsr->num_entries);
4551: PetscLogGpuTimeBegin();
4552: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4553: thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4554: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4555: thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4556: thrust::for_each(zibait,zieait,VecCUDAEquals());
4557: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4558: thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4559: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4560: thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4561: thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4562: MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);
4563: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4565: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4566: CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4567: CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4568: CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4569: auto vT = CcsrT->values->begin();
4570: if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4571: if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4572: (*C)->transupdated = PETSC_TRUE;
4573: }
4574: PetscLogGpuTimeEnd();
4575: }
4576: }
4577: PetscObjectStateIncrease((PetscObject)*C);
4578: (*C)->assembled = PETSC_TRUE;
4579: (*C)->was_assembled = PETSC_FALSE;
4580: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4581: return 0;
4582: }
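/* Example usage (sketch): A and B are sequential CUSPARSE matrices with the same number of rows;
   C ends up with A->cmap->n + B->cmap->n columns.

     Mat C = NULL;
     MatSeqAIJCUSPARSEMergeMats(A,B,MAT_INITIAL_MATRIX,&C);   // builds the structure and values of C = [A B]
     // ... change values (but not the nonzero pattern) of A and/or B ...
     MatSeqAIJCUSPARSEMergeMats(A,B,MAT_REUSE_MATRIX,&C);     // refreshes the values of C in place
*/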
4584: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4585: {
4586: bool dmem;
4587: const PetscScalar *av;
4589: dmem = isCudaMem(v);
4590: MatSeqAIJCUSPARSEGetArrayRead(A,&av);
4591: if (n && idx) {
4592: THRUSTINTARRAY widx(n);
4593: widx.assign(idx,idx+n);
4594: PetscLogCpuToGpu(n*sizeof(PetscInt));
4596: THRUSTARRAY *w = NULL;
4597: thrust::device_ptr<PetscScalar> dv;
4598: if (dmem) {
4599: dv = thrust::device_pointer_cast(v);
4600: } else {
4601: w = new THRUSTARRAY(n);
4602: dv = w->data();
4603: }
4604: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4606: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4607: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4608: thrust::for_each(zibit,zieit,VecCUDAEquals());
4609: if (w) {
4610: cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
4611: }
4612: delete w;
4613: } else {
4614: cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);
4615: }
4616: if (!dmem) PetscLogGpuToCpu(n*sizeof(PetscScalar));
4617: MatSeqAIJCUSPARSERestoreArrayRead(A,&av);
4618: return 0;
4619: }
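/* In effect the routine above performs a device-side gather v[k] = aa[idx[k]], k = 0..n-1, where aa
   is the matrix value array; when idx is NULL it falls back to a plain copy of the first n values,
   and the result is copied back to the host only when v is host memory. */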