Actual source code: aijcusparse.cu

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library.
  4: */
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  8: #include <petscconf.h>
  9: #include <../src/mat/impls/aij/seq/aij.h>
 10: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 11: #include <../src/vec/vec/impls/dvecimpl.h>
 12: #include <petsc/private/vecimpl.h>
 13: #undef VecType
 14: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
 15: #include <thrust/async/for_each.h>

 17: const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
 18: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 19:   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we list them in
 20:     0-based integer-value order, since PetscOptionsEnum() is used to parse user command-line options for them.

 22:   typedef enum {
 23:       CUSPARSE_MV_ALG_DEFAULT = 0,
 24:       CUSPARSE_COOMV_ALG      = 1,
 25:       CUSPARSE_CSRMV_ALG1     = 2,
 26:       CUSPARSE_CSRMV_ALG2     = 3
 27:   } cusparseSpMVAlg_t;

 29:   typedef enum {
 30:       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
 31:       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
 32:       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
 33:       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
 34:       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
 35:       CUSPARSE_SPMM_ALG_DEFAULT = 0,
 36:       CUSPARSE_SPMM_COO_ALG1    = 1,
 37:       CUSPARSE_SPMM_COO_ALG2    = 2,
 38:       CUSPARSE_SPMM_COO_ALG3    = 3,
 39:       CUSPARSE_SPMM_COO_ALG4    = 5,
 40:       CUSPARSE_SPMM_CSR_ALG1    = 4,
 41:       CUSPARSE_SPMM_CSR_ALG2    = 6,
 42:   } cusparseSpMMAlg_t;

 44:   typedef enum {
 45:       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
 46:       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
 47:   } cusparseCsr2CscAlg_t;
 48:   */
 49:   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
 50:   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
 51:   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
 52: #endif
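/*
   Example (sketch): because PetscOptionsEnum() matches by 0-based position in the
   arrays above, the accepted option values are the cuSPARSE enum names with the
   common prefix stripped (matched case-insensitively), e.g.

     -mat_cusparse_spmv_alg csrmv_alg1       maps to CUSPARSE_CSRMV_ALG1    (= 2)
     -mat_cusparse_spmm_alg csr_alg2         maps to CUSPARSE_SPMM_CSR_ALG2 (= 6)
     -mat_cusparse_csr2csc_alg alg1          maps to CUSPARSE_CSR2CSC_ALG1  (= 1)
*/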

 54: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 55: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 56: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 58: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 59: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 60: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 62: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 63: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 64: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 65: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 66: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 67: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
 68: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
 69: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 70: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 71: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 72: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 73: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 74: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 75: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

 77: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 78: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 79: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 80: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 81: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 83: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
 84: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
 85: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

 87: PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
 88: PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

 90: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

 92: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 93: {
 94:   cusparseStatus_t   stat;
 95:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 98:   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
 99:   cusparsestruct->stream = stream;
100:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
101:   return(0);
102: }

104: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
105: {
106:   cusparseStatus_t   stat;
107:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

110:   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
111:   if (cusparsestruct->handle != handle) {
112:     if (cusparsestruct->handle) {
113:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
114:     }
115:     cusparsestruct->handle = handle;
116:   }
117:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
118:   return(0);
119: }
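/*
   Usage sketch (illustrative): an application can share its own CUDA objects with
   the matrix; A must already have type MATSEQAIJCUSPARSE so that A->spptr is populated:

     cudaStream_t     stream;
     cusparseHandle_t handle;
     cudaStreamCreate(&stream);
     cusparseCreate(&handle);
     MatCUSPARSESetHandle(A,handle);
     MatCUSPARSESetStream(A,stream);
*/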

121: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
122: {
123:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
124:   PetscBool          flg;
125:   PetscErrorCode     ierr;

128:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
129:   if (!flg || !cusparsestruct) return(0);
130:   if (cusparsestruct->handle) cusparsestruct->handle = 0;
131:   return(0);
132: }

134: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
135: {
137:   *type = MATSOLVERCUSPARSE;
138:   return(0);
139: }

141: /*MC
142:   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
143:   of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp, on a single GPU. The currently supported
144:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145:   performance of the triangular solves. Full LU and Cholesky decompositions can be solved through the
146:   CUSPARSE triangular solve algorithm; however, the performance can be quite poor, so these
147:   algorithms are not recommended. This class does NOT support direct solver operations.

149:   Level: beginner

151: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152: M*/
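/*
   Usage sketch (illustrative): selecting this solver from application code, assuming
   ksp already operates on a MATSEQAIJCUSPARSE matrix:

     PC pc;
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCILU);
     PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);

   or, equivalently, on the command line: -pc_type ilu -pc_factor_mat_solver_type cusparse
*/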

154: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
155: {
157:   PetscInt       n = A->rmap->n;

160:   MatCreate(PetscObjectComm((PetscObject)A),B);
161:   MatSetSizes(*B,n,n,n,n);
162:   (*B)->factortype = ftype;
163:   MatSetType(*B,MATSEQAIJCUSPARSE);

165:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
166:     MatSetBlockSizesFromMats(*B,A,A);
167:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
168:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
169:     PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
170:     PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);
171:     PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);
172:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
173:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
174:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
175:     PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);
176:     PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);
177:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

179:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
180:   (*B)->canuseordering = PETSC_TRUE;
181:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
182:   return(0);
183: }
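/*
   Sketch (illustrative) of driving this factory directly, without a PC; assumes
   A is a MATSEQAIJCUSPARSE matrix and b, x are compatible vectors:

     Mat           F;
     IS            rowperm,colperm;
     MatFactorInfo info;
     MatFactorInfoInitialize(&info);
     MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);
     MatGetOrdering(A,MATORDERINGNATURAL,&rowperm,&colperm);
     MatILUFactorSymbolic(F,A,rowperm,colperm,&info);
     MatLUFactorNumeric(F,A,&info);
     MatSolve(F,b,x);
*/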

185: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
186: {
187:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

190:   switch (op) {
191:   case MAT_CUSPARSE_MULT:
192:     cusparsestruct->format = format;
193:     break;
194:   case MAT_CUSPARSE_ALL:
195:     cusparsestruct->format = format;
196:     break;
197:   default:
198:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
199:   }
200:   return(0);
201: }

203: /*@
204:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
205:    operation. Only the MatMult operation can use different GPU storage formats
206:    for MPIAIJCUSPARSE matrices.

207:    Not Collective

209:    Input Parameters:
210: +  A - Matrix of type SEQAIJCUSPARSE
211: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
212: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

216:    Level: intermediate

218: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
219: @*/
220: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
221: {

226:   PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
227:   return(0);
228: }
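/*
   Example (sketch): keep everything in CSR but store the SpMV operand in ELL format,
   either from code,

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);

   or from the command line with -mat_cusparse_mult_storage_format ell
*/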

230: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
231: {

235:   switch (op) {
236:     case MAT_FORM_EXPLICIT_TRANSPOSE:
237:       /* destroy the transpose matrix, if present, to prevent logic errors if flg is later set back to true */
238:       if (A->form_explicit_transpose && !flg) {MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);}
239:       A->form_explicit_transpose = flg;
240:       break;
241:     default:
242:       MatSetOption_SeqAIJ(A,op,flg);
243:       break;
244:   }
245:   return(0);
246: }
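/*
   Example (sketch): request that an explicit transpose be kept on the GPU so that
   repeated MatMultTranspose() calls can reuse it instead of rebuilding it:

     MatSetOption(A,MAT_FORM_EXPLICIT_TRANSPOSE,PETSC_TRUE);
*/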

248: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

250: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
251: {
252:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
253:   IS             isrow = b->row,iscol = b->col;
254:   PetscBool      row_identity,col_identity;

258:   MatSeqAIJCUSPARSECopyFromGPU(A);
259:   MatLUFactorNumeric_SeqAIJ(B,A,info);
260:   B->offloadmask = PETSC_OFFLOAD_CPU;
261:   /* determine which version of MatSolve needs to be used. */
262:   ISIdentity(isrow,&row_identity);
263:   ISIdentity(iscol,&col_identity);
264:   if (row_identity && col_identity) {
265:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
266:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
267:     B->ops->matsolve = NULL;
268:     B->ops->matsolvetranspose = NULL;
269:   } else {
270:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
271:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
272:     B->ops->matsolve = NULL;
273:     B->ops->matsolvetranspose = NULL;
274:   }

276:   /* get the triangular factors */
277:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
278:   return(0);
279: }

281: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
282: {
283:   PetscErrorCode           ierr;
284:   MatCUSPARSEStorageFormat format;
285:   PetscBool                flg;
286:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

289:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
290:   if (A->factortype == MAT_FACTOR_NONE) {
291:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
292:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
293:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);}

295:     PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
296:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
297:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);}
298:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
299:     PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
300:                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
301:     /* If the user used this option, check it for consistency with cuSPARSE, since PetscOptionsEnum() assigns enum values by position in MatCUSPARSESpMVAlgorithms[] */
302: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
303:     if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
304: #else
305:     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
306: #endif
307:     PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
308:                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
309:     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

311:     PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
312:                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
313:     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
314:    #endif
315:   }
316:   PetscOptionsTail();
317:   return(0);
318: }
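/*
   Example (sketch): with a CUDA 11 build, the options processed above can be
   combined at run time, e.g.

     ./app -mat_type seqaijcusparse -mat_cusparse_storage_format csr \
           -mat_cusparse_spmv_alg csrmv_alg1 -mat_cusparse_csr2csc_alg alg1
*/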

320: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
321: {
322:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
323:   PetscErrorCode               ierr;

326:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
327:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
328:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
329:   return(0);
330: }

332: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
333: {
334:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
335:   PetscErrorCode               ierr;

338:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
339:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
340:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
341:   return(0);
342: }

344: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
345: {
346:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
347:   PetscErrorCode               ierr;

350:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
351:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
352:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
353:   return(0);
354: }

356: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
357: {
358:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
359:   PetscErrorCode               ierr;

362:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
363:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
364:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
365:   return(0);
366: }

368: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
369: {
370:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
371:   PetscInt                          n = A->rmap->n;
372:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
373:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
374:   cusparseStatus_t                  stat;
375:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
376:   const MatScalar                   *aa = a->a,*v;
377:   PetscInt                          *AiLo, *AjLo;
378:   PetscInt                          i,nz, nzLower, offset, rowOffset;
379:   PetscErrorCode                    ierr;
380:   cudaError_t                       cerr;

383:   if (!n) return(0);
384:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
385:     try {
386:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
387:       nzLower=n+ai[n]-ai[1];
388:       if (!loTriFactor) {
389:         PetscScalar                       *AALo;

391:         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

393:         /* Allocate Space for the lower triangular matrix */
394:         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
395:         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

397:         /* Fill the lower triangular matrix */
398:         AiLo[0]  = (PetscInt) 0;
399:         AiLo[n]  = nzLower;
400:         AjLo[0]  = (PetscInt) 0;
401:         AALo[0]  = (MatScalar) 1.0;
402:         v        = aa;
403:         vi       = aj;
404:         offset   = 1;
405:         rowOffset= 1;
406:         for (i=1; i<n; i++) {
407:           nz = ai[i+1] - ai[i];
408:           /* additional 1 for the term on the diagonal */
409:           AiLo[i]    = rowOffset;
410:           rowOffset += nz+1;

412:           PetscArraycpy(&(AjLo[offset]), vi, nz);
413:           PetscArraycpy(&(AALo[offset]), v, nz);

415:           offset      += nz;
416:           AjLo[offset] = (PetscInt) i;
417:           AALo[offset] = (MatScalar) 1.0;
418:           offset      += 1;

420:           v  += nz;
421:           vi += nz;
422:         }

424:         /* allocate space for the triangular factor information */
425:         PetscNew(&loTriFactor);
426:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
427:         /* Create the matrix description */
428:         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
429:         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
430:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
431:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
432:        #else
433:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
434:        #endif
435:         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
436:         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

438:         /* set the operation */
439:         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

441:         /* set the matrix */
442:         loTriFactor->csrMat = new CsrMatrix;
443:         loTriFactor->csrMat->num_rows = n;
444:         loTriFactor->csrMat->num_cols = n;
445:         loTriFactor->csrMat->num_entries = nzLower;

447:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
448:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

450:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
451:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

453:         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
454:         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

456:         /* Create the solve analysis information */
457:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
458:         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
459:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
460:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
461:                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
462:                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
463:                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
464:                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
465:         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
466:       #endif

468:         /* perform the solve analysis */
469:         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
470:                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
471:                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
472:                                  loTriFactor->csrMat->column_indices->data().get(),
473:                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
474:                                  loTriFactor->solveInfo,
475:                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
476:                                #else
477:                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
478:                                #endif
479:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
480:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

482:         /* assign the pointer */
483:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
484:         loTriFactor->AA_h = AALo;
485:         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
486:         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
487:         PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
488:       } else { /* update values only */
489:         if (!loTriFactor->AA_h) {
490:           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
491:         }
492:         /* Fill the lower triangular matrix */
493:         loTriFactor->AA_h[0]  = 1.0;
494:         v        = aa;
495:         vi       = aj;
496:         offset   = 1;
497:         for (i=1; i<n; i++) {
498:           nz = ai[i+1] - ai[i];
499:           PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);
500:           offset      += nz;
501:           loTriFactor->AA_h[offset] = 1.0;
502:           offset      += 1;
503:           v  += nz;
504:         }
505:         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
506:         PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));
507:       }
508:     } catch(char *ex) {
509:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
510:     }
511:   }
512:   return(0);
513: }
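/*
   Worked example (illustrative) of the packing above: for a 3x3 factor whose
   strictly lower entries are a10 (row 1) and a20, a21 (row 2), the routine
   builds L with an explicit unit diagonal appended to each row:

     AiLo = {0, 1, 3, 6}
     AjLo = {0,   0, 1,     0, 1, 2}
     AALo = {1.0, a10, 1.0, a20, a21, 1.0}

   Each row stores its nz strictly-lower entries followed by the unit diagonal,
   which is why nzLower = n + ai[n] - ai[1].
*/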

515: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
516: {
517:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
518:   PetscInt                          n = A->rmap->n;
519:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
520:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
521:   cusparseStatus_t                  stat;
522:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
523:   const MatScalar                   *aa = a->a,*v;
524:   PetscInt                          *AiUp, *AjUp;
525:   PetscInt                          i,nz, nzUpper, offset;
526:   PetscErrorCode                    ierr;
527:   cudaError_t                       cerr;

530:   if (!n) return(0);
531:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
532:     try {
533:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
534:       nzUpper = adiag[0]-adiag[n];
535:       if (!upTriFactor) {
536:         PetscScalar *AAUp;

538:         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

540:         /* Allocate Space for the upper triangular matrix */
541:         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
542:         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

544:         /* Fill the upper triangular matrix */
545:         AiUp[0]=(PetscInt) 0;
546:         AiUp[n]=nzUpper;
547:         offset = nzUpper;
548:         for (i=n-1; i>=0; i--) {
549:           v  = aa + adiag[i+1] + 1;
550:           vi = aj + adiag[i+1] + 1;

552:           /* number of elements NOT on the diagonal */
553:           nz = adiag[i] - adiag[i+1]-1;

555:           /* decrement the offset */
556:           offset -= (nz+1);

558:           /* first, set the diagonal elements */
559:           AjUp[offset] = (PetscInt) i;
560:           AAUp[offset] = (MatScalar)1./v[nz];
561:           AiUp[i]      = AiUp[i+1] - (nz+1);

563:           PetscArraycpy(&(AjUp[offset+1]), vi, nz);
564:           PetscArraycpy(&(AAUp[offset+1]), v, nz);
565:         }

567:         /* allocate space for the triangular factor information */
568:         PetscNew(&upTriFactor);
569:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

571:         /* Create the matrix description */
572:         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
573:         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
574:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
575:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
576:        #else
577:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
578:        #endif
579:         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
580:         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

582:         /* set the operation */
583:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

585:         /* set the matrix */
586:         upTriFactor->csrMat = new CsrMatrix;
587:         upTriFactor->csrMat->num_rows = n;
588:         upTriFactor->csrMat->num_cols = n;
589:         upTriFactor->csrMat->num_entries = nzUpper;

591:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
592:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

594:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
595:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

597:         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
598:         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

600:         /* Create the solve analysis information */
601:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
602:         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
603:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
604:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
605:                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
606:                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
607:                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
608:                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
609:         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
610:       #endif

612:         /* perform the solve analysis */
613:         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
614:                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
615:                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
616:                                  upTriFactor->csrMat->column_indices->data().get(),
617:                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
618:                                  upTriFactor->solveInfo,
619:                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
620:                                #else
621:                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
622:                                #endif
623:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
624:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

626:         /* assign the pointer */
627:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
628:         upTriFactor->AA_h = AAUp;
629:         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
630:         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
631:         PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
632:       } else {
633:         if (!upTriFactor->AA_h) {
634:           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
635:         }
636:         /* Fill the upper triangular matrix */
637:         offset = nzUpper;
638:         for (i=n-1; i>=0; i--) {
639:           v  = aa + adiag[i+1] + 1;

641:           /* number of elements NOT on the diagonal */
642:           nz = adiag[i] - adiag[i+1]-1;

644:           /* decrement the offset */
645:           offset -= (nz+1);

647:           /* first, set the diagonal elements */
648:           upTriFactor->AA_h[offset] = 1./v[nz];
649:           PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);
650:         }
651:         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
652:         PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));
653:       }
654:     } catch(char *ex) {
655:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
656:     }
657:   }
658:   return(0);
659: }
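/*
   Complementary sketch for the upper factor: rows are filled back-to-front and each
   row stores the reciprocal of its diagonal first, so a row with entries
   (d, u1, u2) is packed as {1/d, u1, u2}; the total count is nzUpper = adiag[0] - adiag[n].
*/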

661: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
662: {
663:   PetscErrorCode               ierr;
664:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
665:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
666:   IS                           isrow = a->row,iscol = a->icol;
667:   PetscBool                    row_identity,col_identity;
668:   PetscInt                     n = A->rmap->n;

671:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
672:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
673:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

675:   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
676:   cusparseTriFactors->nnz=a->nz;

678:   A->offloadmask = PETSC_OFFLOAD_BOTH;
679:   /* lower triangular indices */
680:   ISIdentity(isrow,&row_identity);
681:   if (!row_identity && !cusparseTriFactors->rpermIndices) {
682:     const PetscInt *r;

684:     ISGetIndices(isrow,&r);
685:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
686:     cusparseTriFactors->rpermIndices->assign(r, r+n);
687:     ISRestoreIndices(isrow,&r);
688:     PetscLogCpuToGpu(n*sizeof(PetscInt));
689:   }

691:   /* upper triangular indices */
692:   ISIdentity(iscol,&col_identity);
693:   if (!col_identity && !cusparseTriFactors->cpermIndices) {
694:     const PetscInt *c;

696:     ISGetIndices(iscol,&c);
697:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
698:     cusparseTriFactors->cpermIndices->assign(c, c+n);
699:     ISRestoreIndices(iscol,&c);
700:     PetscLogCpuToGpu(n*sizeof(PetscInt));
701:   }
702:   return(0);
703: }
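/*
   Sketch (illustrative) of how the solve routines later in this file consume these
   index arrays: the permutation is applied on the device with thrust, e.g. gathering
   a permuted right-hand side into the work vector (here barray is assumed to be a
   thrust::device_ptr to the RHS values and tempGPU the work vector):

     thrust::copy(thrust::make_permutation_iterator(barray, cusparseTriFactors->rpermIndices->begin()),
                  thrust::make_permutation_iterator(barray, cusparseTriFactors->rpermIndices->end()),
                  tempGPU->begin());
*/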

705: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
706: {
707:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
708:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
709:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
710:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
711:   cusparseStatus_t                  stat;
712:   PetscErrorCode                    ierr;
713:   cudaError_t                       cerr;
714:   PetscInt                          *AiUp, *AjUp;
715:   PetscScalar                       *AAUp;
716:   PetscScalar                       *AALo;
717:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
718:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
719:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
720:   const MatScalar                   *aa = b->a,*v;

723:   if (!n) return(0);
724:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
725:     try {
726:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
727:       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
728:       if (!upTriFactor && !loTriFactor) {
729:         /* Allocate Space for the upper triangular matrix */
730:         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
731:         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

733:         /* Fill the upper triangular matrix */
734:         AiUp[0]=(PetscInt) 0;
735:         AiUp[n]=nzUpper;
736:         offset = 0;
737:         for (i=0; i<n; i++) {
738:           /* set the pointers */
739:           v  = aa + ai[i];
740:           vj = aj + ai[i];
741:           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

743:           /* first, set the diagonal elements */
744:           AjUp[offset] = (PetscInt) i;
745:           AAUp[offset] = (MatScalar)1.0/v[nz];
746:           AiUp[i]      = offset;
747:           AALo[offset] = (MatScalar)1.0/v[nz];

749:           offset+=1;
750:           if (nz>0) {
751:             PetscArraycpy(&(AjUp[offset]), vj, nz);
752:             PetscArraycpy(&(AAUp[offset]), v, nz);
753:             for (j=offset; j<offset+nz; j++) {
754:               AAUp[j] = -AAUp[j];
755:               AALo[j] = AAUp[j]/v[nz];
756:             }
757:             offset+=nz;
758:           }
759:         }

761:         /* allocate space for the triangular factor information */
762:         PetscNew(&upTriFactor);
763:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

765:         /* Create the matrix description */
766:         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
767:         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
768:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
769:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
770:        #else
771:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
772:        #endif
773:         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
774:         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

776:         /* set the matrix */
777:         upTriFactor->csrMat = new CsrMatrix;
778:         upTriFactor->csrMat->num_rows = A->rmap->n;
779:         upTriFactor->csrMat->num_cols = A->cmap->n;
780:         upTriFactor->csrMat->num_entries = a->nz;

782:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
783:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

785:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
786:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

788:         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
789:         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

791:         /* set the operation */
792:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

794:         /* Create the solve analysis information */
795:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
796:         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
797:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
798:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
799:                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
800:                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
801:                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
802:                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
803:         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
804:       #endif

806:         /* perform the solve analysis */
807:         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
808:                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
809:                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
810:                                  upTriFactor->csrMat->column_indices->data().get(),
811:                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
812:                                  upTriFactor->solveInfo,
813:                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
814:                                 #else
815:                                   upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
816:                                 #endif
817:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
818:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

820:         /* assign the pointer */
821:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

823:         /* allocate space for the triangular factor information */
824:         PetscNew(&loTriFactor);
825:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

827:         /* Create the matrix description */
828:         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
829:         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
830:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
831:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
832:        #else
833:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
834:        #endif
835:         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
836:         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

838:         /* set the operation */
839:         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
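        /* Note: this "lower" factor reuses the upper factor's CSR arrays (AiUp/AjUp, assigned below)
           and is applied with a transpose solve, so no separate lower-triangular pattern is stored */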

841:         /* set the matrix */
842:         loTriFactor->csrMat = new CsrMatrix;
843:         loTriFactor->csrMat->num_rows = A->rmap->n;
844:         loTriFactor->csrMat->num_cols = A->cmap->n;
845:         loTriFactor->csrMat->num_entries = a->nz;

847:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
848:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

850:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
851:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

853:         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
854:         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

856:         /* Create the solve analysis information */
857:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
858:         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
859:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
860:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
861:                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
862:                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
863:                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
864:                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
865:         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
866:       #endif

868:         /* perform the solve analysis */
869:         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
870:                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
871:                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
872:                                  loTriFactor->csrMat->column_indices->data().get(),
873:                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
874:                                  loTriFactor->solveInfo,
875:                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
876:                                 #else
877:                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
878:                                 #endif
879:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
880:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

882:         /* assign the pointer */
883:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

885:         PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));
886:         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
887:         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
888:       } else {
889:         /* Fill the upper triangular matrix */
890:         offset = 0;
891:         for (i=0; i<n; i++) {
892:           /* set the pointers */
893:           v  = aa + ai[i];
894:           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

896:           /* first, set the diagonal elements */
897:           AAUp[offset] = 1.0/v[nz];
898:           AALo[offset] = 1.0/v[nz];

900:           offset+=1;
901:           if (nz>0) {
902:             PetscArraycpy(&(AAUp[offset]), v, nz);
903:             for (j=offset; j<offset+nz; j++) {
904:               AAUp[j] = -AAUp[j];
905:               AALo[j] = AAUp[j]/v[nz];
906:             }
907:             offset+=nz;
908:           }
909:         }
910:         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing upper triangular factor");
911:         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing lower triangular factor");
912:         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
913:         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
914:         PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));
915:       }
916:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
917:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
918:     } catch(char *ex) {
919:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
920:     }
921:   }
922:   return(0);
923: }

925: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
926: {
927:   PetscErrorCode               ierr;
928:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
929:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
930:   IS                           ip = a->row;
931:   PetscBool                    perm_identity;
932:   PetscInt                     n = A->rmap->n;

935:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
936:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
937:   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
938:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;

940:   A->offloadmask = PETSC_OFFLOAD_BOTH;

942:   /* lower triangular indices */
943:   ISIdentity(ip,&perm_identity);
944:   if (!perm_identity) {
945:     IS             iip;
946:     const PetscInt *irip,*rip;

948:     ISInvertPermutation(ip,PETSC_DECIDE,&iip);
949:     ISGetIndices(iip,&irip);
950:     ISGetIndices(ip,&rip);
951:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
952:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
953:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
954:     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
955:     ISRestoreIndices(iip,&irip);
956:     ISDestroy(&iip);
957:     ISRestoreIndices(ip,&rip);
958:     PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
959:   }
960:   return(0);
961: }

963: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
964: {
965:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
966:   IS             ip = b->row;
967:   PetscBool      perm_identity;

971:   MatSeqAIJCUSPARSECopyFromGPU(A);
972:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
973:   B->offloadmask = PETSC_OFFLOAD_CPU;
974:   /* determine which version of MatSolve needs to be used. */
975:   ISIdentity(ip,&perm_identity);
976:   if (perm_identity) {
977:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
978:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
979:     B->ops->matsolve = NULL;
980:     B->ops->matsolvetranspose = NULL;
981:   } else {
982:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
983:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
984:     B->ops->matsolve = NULL;
985:     B->ops->matsolvetranspose = NULL;
986:   }

988:   /* get the triangular factors */
989:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
990:   return(0);
991: }
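/*
   Example (sketch): using ICC(0) from this solver inside a KSP run, e.g.

     ./app -mat_type seqaijcusparse -pc_type icc \
           -pc_factor_mat_solver_type cusparse -pc_factor_mat_ordering_type natural

   With the natural ordering the permutation is the identity, so the faster
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering path above is selected.
*/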

993: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
994: {
995:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
996:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
997:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
998:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
999:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1000:   cusparseStatus_t                  stat;
1001:   cusparseIndexBase_t               indexBase;
1002:   cusparseMatrixType_t              matrixType;
1003:   cusparseFillMode_t                fillMode;
1004:   cusparseDiagType_t                diagType;
1005:   cudaError_t                       cerr;
1006:   PetscErrorCode                    ierr;

1009:   /* allocate space for the transpose of the lower triangular factor */
1010:   PetscNew(&loTriFactorT);
1011:   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1013:   /* set the matrix descriptors of the lower triangular factor */
1014:   matrixType = cusparseGetMatType(loTriFactor->descr);
1015:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1016:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1017:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1018:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

1020:   /* Create the matrix description */
1021:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1022:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1023:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1024:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1025:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

1027:   /* set the operation */
1028:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1030:   /* allocate GPU space for the CSC of the lower triangular factor */
1031:   loTriFactorT->csrMat = new CsrMatrix;
1032:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1033:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1034:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1035:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1036:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1037:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

1039:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1040: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1041:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1042:                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1043:                                        loTriFactor->csrMat->values->data().get(),
1044:                                        loTriFactor->csrMat->row_offsets->data().get(),
1045:                                        loTriFactor->csrMat->column_indices->data().get(),
1046:                                        loTriFactorT->csrMat->values->data().get(),
1047:                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1048:                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1049:                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1050:   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1051: #endif

1053:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1054:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1055:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1056:                           loTriFactor->csrMat->values->data().get(),
1057:                           loTriFactor->csrMat->row_offsets->data().get(),
1058:                           loTriFactor->csrMat->column_indices->data().get(),
1059:                           loTriFactorT->csrMat->values->data().get(),
1060:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1061:                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1062:                           CUSPARSE_ACTION_NUMERIC, indexBase,
1063:                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1064:                         #else
1065:                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1066:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1067:                         #endif
1068:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1069:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);

1071:   /* Create the solve analysis information */
1072:   PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1073:   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1074: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1075:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1076:                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1077:                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1078:                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1079:                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1080:   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1081: #endif

1083:   /* perform the solve analysis */
1084:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1085:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1086:                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1087:                            loTriFactorT->csrMat->column_indices->data().get(),
1088:                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1089:                            loTriFactorT->solveInfo,
1090:                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1091:                           #else
1092:                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1093:                           #endif
1094:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1095:   PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

1097:   /* assign the pointer */
1098:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

1100:   /*********************************************/
1101:   /* Now the Transpose of the Upper Tri Factor */
1102:   /*********************************************/

1104:   /* allocate space for the transpose of the upper triangular factor */
1105:   PetscNew(&upTriFactorT);
1106:   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1108:   /* set the matrix descriptors of the upper triangular factor */
1109:   matrixType = cusparseGetMatType(upTriFactor->descr);
1110:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1111:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1112:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1113:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

1115:   /* Create the matrix description */
1116:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1117:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1118:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1119:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1120:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

1122:   /* set the operation */
1123:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1125:   /* allocate GPU space for the CSC of the upper triangular factor*/
1126:   upTriFactorT->csrMat = new CsrMatrix;
1127:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1128:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1129:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1130:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1131:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1132:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

1134:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1135: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1136:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1137:                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1138:                                 upTriFactor->csrMat->values->data().get(),
1139:                                 upTriFactor->csrMat->row_offsets->data().get(),
1140:                                 upTriFactor->csrMat->column_indices->data().get(),
1141:                                 upTriFactorT->csrMat->values->data().get(),
1142:                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1143:                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1144:                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1145:   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1146: #endif

1148:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1149:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1150:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1151:                           upTriFactor->csrMat->values->data().get(),
1152:                           upTriFactor->csrMat->row_offsets->data().get(),
1153:                           upTriFactor->csrMat->column_indices->data().get(),
1154:                           upTriFactorT->csrMat->values->data().get(),
1155:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1156:                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1157:                           CUSPARSE_ACTION_NUMERIC, indexBase,
1158:                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1159:                         #else
1160:                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1161:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1162:                         #endif

1164:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1165:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);

1167:   /* Create the solve analysis information */
1168:   PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1169:   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1170:   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1171:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1172:                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1173:                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1174:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1175:                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1176:   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1177:   #endif

1179:   /* perform the solve analysis */
1180:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1181:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1182:                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1183:                            upTriFactorT->csrMat->column_indices->data().get(),
1184:                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1185:                            upTriFactorT->solveInfo,
1186:                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1187:                           #else
1188:                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1189:                           #endif

1191:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1192:   PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

1194:   /* assign the pointer */
1195:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1196:   return(0);
1197: }

1199: struct PetscScalarToPetscInt
1200: {
1201:   __host__ __device__
1202:   PetscInt operator()(PetscScalar s)
1203:   {
1204:     return (PetscInt)PetscRealPart(s);
1205:   }
1206: };
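/* PetscScalarToPetscInt is the last step of a permutation-recovery trick used in
   MatSeqAIJCUSPARSEFormExplicitTranspose() below: csr2csc is run once on the values
   0,1,2,... and the permuted reals are cast back to integers, which yields the
   CSR->CSC permutation csr2csc_i. A minimal sketch of the idea (simplified from the
   code below):

     THRUSTARRAY ident(nnz);                             // 0,1,2,... stored as scalars
     thrust::sequence(thrust::device, ident.begin(), ident.end(), 0.0);
     // run csr2csc with ident as the input values; matrixT->values then holds the
     // permutation in scalar form, so cast it back to integers:
     thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(),
                       csr2csc_i->begin(), PetscScalarToPetscInt());

   Afterwards a transpose update is a single gather through csr2csc_i instead of a
   full csr2csc call. */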

1208: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1209: {
1210:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1211:   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1212:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1213:   cusparseStatus_t             stat;
1214:   cusparseIndexBase_t          indexBase;
1215:   cudaError_t                  err;
1216:   PetscErrorCode               ierr;

1219:   MatSeqAIJCUSPARSECopyToGPU(A);
1220:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1221:   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1222:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1223:   if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
1224:   if (A->transupdated) return(0);
1225:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1226:   PetscLogGpuTimeBegin();
1227:   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1228:     MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1229:   }
1230:   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1231:     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1232:     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1233:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1234:     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1235:     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1237:     /* set alpha and beta */
1238:     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1239:     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1240:     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1241:     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1242:     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1243:     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

1245:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1246:       CsrMatrix *matrixT = new CsrMatrix;
1247:       matstructT->mat = matrixT;
1248:       matrixT->num_rows = A->cmap->n;
1249:       matrixT->num_cols = A->rmap->n;
1250:       matrixT->num_entries = a->nz;
1251:       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1252:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1253:       matrixT->values = new THRUSTARRAY(a->nz);

1255:       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1256:       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

1258:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259:       #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1260:         stat = cusparseCreateCsr(&matstructT->matDescr,
1261:                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1262:                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1263:                                matrixT->values->data().get(),
1264:                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1265:                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1266:       #else
1267:         /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1268:            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

1270:            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1271:            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1272:            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1273:         */
1274:         if (matrixT->num_entries) {
1275:           stat = cusparseCreateCsr(&matstructT->matDescr,
1276:                                  matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1277:                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1278:                                  matrixT->values->data().get(),
1279:                                  CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
1280:                                  indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

1282:         } else {
1283:           matstructT->matDescr = NULL;
1284:           matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1285:         }
1286:       #endif
1287:      #endif
1288:     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1289:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1290:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1291:    #else
1292:       CsrMatrix *temp  = new CsrMatrix;
1293:       CsrMatrix *tempT = new CsrMatrix;
1294:       /* First convert HYB to CSR */
1295:       temp->num_rows = A->rmap->n;
1296:       temp->num_cols = A->cmap->n;
1297:       temp->num_entries = a->nz;
1298:       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1299:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1300:       temp->values = new THRUSTARRAY(a->nz);

1302:       stat = cusparse_hyb2csr(cusparsestruct->handle,
1303:                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1304:                               temp->values->data().get(),
1305:                               temp->row_offsets->data().get(),
1306:                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);

1308:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1309:       tempT->num_rows = A->cmap->n; /* the transpose of an m x n matrix is n x m */
1310:       tempT->num_cols = A->rmap->n;
1311:       tempT->num_entries = a->nz;
1312:       tempT->row_offsets = new THRUSTINTARRAY32(A->cmap->n+1);
1313:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1314:       tempT->values = new THRUSTARRAY(a->nz);

1316:       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1317:                               temp->num_cols, temp->num_entries,
1318:                               temp->values->data().get(),
1319:                               temp->row_offsets->data().get(),
1320:                               temp->column_indices->data().get(),
1321:                               tempT->values->data().get(),
1322:                               tempT->column_indices->data().get(),
1323:                               tempT->row_offsets->data().get(),
1324:                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

1326:       /* Last, convert CSC to HYB */
1327:       cusparseHybMat_t hybMat;
1328:       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1329:       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1330:         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1331:       stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
1332:                               matstructT->descr, tempT->values->data().get(),
1333:                               tempT->row_offsets->data().get(),
1334:                               tempT->column_indices->data().get(),
1335:                               hybMat, 0, partition);CHKERRCUSPARSE(stat);

1337:       /* assign the pointer */
1338:       matstructT->mat = hybMat;
1339:       A->transupdated = PETSC_TRUE;
1340:       /* delete temporaries */
1341:       if (tempT) {
1342:         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1343:         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1344:         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1345:         delete (CsrMatrix*) tempT;
1346:       }
1347:       if (temp) {
1348:         if (temp->values) delete (THRUSTARRAY*) temp->values;
1349:         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1350:         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1351:         delete (CsrMatrix*) temp;
1352:       }
1353:      #endif
1354:     }
1355:   }
1356:   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1357:     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1358:     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1359:     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1360:     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1361:     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1362:     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1363:     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1364:     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1365:     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1366:     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1367:     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1368:       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1369:       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1370:       PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
1371:     }
1372:     if (!cusparsestruct->csr2csc_i) {
1373:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1374:       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

1376:       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1377:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1378:       void   *csr2cscBuffer;
1379:       size_t csr2cscBufferSize;
1380:       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1381:                                            A->cmap->n, matrix->num_entries,
1382:                                            matrix->values->data().get(),
1383:                                            cusparsestruct->rowoffsets_gpu->data().get(),
1384:                                            matrix->column_indices->data().get(),
1385:                                            matrixT->values->data().get(),
1386:                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1387:                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1388:                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1389:       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1390:      #endif

1392:       if (matrix->num_entries) {
1393:         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1394:            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1395:            I checked every parameter and they were just fine. I have no clue why cusparse complains.

1397:            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1398:            should be filled with indexBase. So I just take a shortcut here.
1399:         */
1400:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1401:                               A->cmap->n,matrix->num_entries,
1402:                               csr2csc_a.data().get(),
1403:                               cusparsestruct->rowoffsets_gpu->data().get(),
1404:                               matrix->column_indices->data().get(),
1405:                               matrixT->values->data().get(),
1406:                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1407:                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1408:                               CUSPARSE_ACTION_NUMERIC,indexBase,
1409:                               cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1410:                              #else
1411:                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1412:                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1413:                              #endif
1414:       } else {
1415:         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1416:       }

1418:       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1419:       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1420:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1421:       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1422:      #endif
1423:     }
1424:     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1425:                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1426:                                                      matrixT->values->begin()));
1427:   }
1428:   PetscLogGpuTimeEnd();
1429:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1430:   /* the compressed row indices are not used for matTranspose */
1431:   matstructT->cprowIndices = NULL;
1432:   /* assign the pointer */
1433:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1434:   A->transupdated = PETSC_TRUE;
1435:   return(0);
1436: }

1438: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
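/* A plausible answer: the legacy csrsv2 interface does accept
   op(A) = CUSPARSE_OPERATION_TRANSPOSE, e.g. (a hedged sketch, not used here)

     cusparseDcsrsv2_solve(handle, CUSPARSE_OPERATION_TRANSPOSE, m, nnz, &alpha, descr,
                           values, rowOffsets, colIndices, info, b, x, policy, buffer);

   but transposed triangular solves tend to be much slower than solves on an explicitly
   formed transpose, which is presumably why the transposed factors are built once and
   reused instead. */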
1439: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1440: {
1441:   PetscInt                              n = xx->map->n;
1442:   const PetscScalar                     *barray;
1443:   PetscScalar                           *xarray;
1444:   thrust::device_ptr<const PetscScalar> bGPU;
1445:   thrust::device_ptr<PetscScalar>       xGPU;
1446:   cusparseStatus_t                      stat;
1447:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1448:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1449:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1450:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1451:   PetscErrorCode                        ierr;

1454:   /* Analyze the matrix and create the transpose ... on the fly */
1455:   if (!loTriFactorT && !upTriFactorT) {
1456:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1457:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1458:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1459:   }

1461:   /* Get the GPU pointers */
1462:   VecCUDAGetArrayWrite(xx,&xarray);
1463:   VecCUDAGetArrayRead(bb,&barray);
1464:   xGPU = thrust::device_pointer_cast(xarray);
1465:   bGPU = thrust::device_pointer_cast(barray);

1467:   PetscLogGpuTimeBegin();
1468:   /* First, reorder with the row permutation */
1469:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1470:                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1471:                xGPU);

1473:   /* Next, solve U */
1474:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1475:                         upTriFactorT->csrMat->num_rows,
1476:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1477:                         upTriFactorT->csrMat->num_entries,
1478:                       #endif
1479:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1480:                         upTriFactorT->csrMat->values->data().get(),
1481:                         upTriFactorT->csrMat->row_offsets->data().get(),
1482:                         upTriFactorT->csrMat->column_indices->data().get(),
1483:                         upTriFactorT->solveInfo,
1484:                         xarray,
1485:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1486:                         tempGPU->data().get(),
1487:                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1488:                       #else
1489:                         tempGPU->data().get());CHKERRCUSPARSE(stat);
1490:                       #endif

1492:   /* Then, solve L */
1493:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1494:                         loTriFactorT->csrMat->num_rows,
1495:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1496:                         loTriFactorT->csrMat->num_entries,
1497:                       #endif
1498:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1499:                         loTriFactorT->csrMat->values->data().get(),
1500:                         loTriFactorT->csrMat->row_offsets->data().get(),
1501:                         loTriFactorT->csrMat->column_indices->data().get(),
1502:                         loTriFactorT->solveInfo,
1503:                         tempGPU->data().get(),
1504:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1505:                         xarray,
1506:                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1507:                       #else
1508:                          xarray);CHKERRCUSPARSE(stat);
1509:                       #endif

1511:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1512:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1513:                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1514:                tempGPU->begin());

1516:   /* Copy the temporary to the full solution. */
1517:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

1519:   /* restore */
1520:   VecCUDARestoreArrayRead(bb,&barray);
1521:   VecCUDARestoreArrayWrite(xx,&xarray);
1522:   PetscLogGpuTimeEnd();
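  /* standard PETSc flop estimate for a forward plus a backward triangular solve:
     a multiply and an add for each stored nonzero of the combined factors, less
     one operation per row for the diagonal */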
1523:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1524:   return(0);
1525: }

1527: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1528: {
1529:   const PetscScalar                 *barray;
1530:   PetscScalar                       *xarray;
1531:   cusparseStatus_t                  stat;
1532:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1533:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1534:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1535:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1536:   PetscErrorCode                    ierr;

1539:   /* Analyze the matrix and create the transpose ... on the fly */
1540:   if (!loTriFactorT && !upTriFactorT) {
1541:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1542:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1543:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1544:   }

1546:   /* Get the GPU pointers */
1547:   VecCUDAGetArrayWrite(xx,&xarray);
1548:   VecCUDAGetArrayRead(bb,&barray);

1550:   PetscLogGpuTimeBegin();
1551:   /* First, solve U */
1552:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1553:                         upTriFactorT->csrMat->num_rows,
1554:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1555:                         upTriFactorT->csrMat->num_entries,
1556:                       #endif
1557:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1558:                         upTriFactorT->csrMat->values->data().get(),
1559:                         upTriFactorT->csrMat->row_offsets->data().get(),
1560:                         upTriFactorT->csrMat->column_indices->data().get(),
1561:                         upTriFactorT->solveInfo,
1562:                         barray,
1563:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1564:                         tempGPU->data().get(),
1565:                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1566:                       #else
1567:                         tempGPU->data().get());CHKERRCUSPARSE(stat);
1568:                       #endif

1570:   /* Then, solve L */
1571:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1572:                         loTriFactorT->csrMat->num_rows,
1573:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1574:                         loTriFactorT->csrMat->num_entries,
1575:                       #endif
1576:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1577:                         loTriFactorT->csrMat->values->data().get(),
1578:                         loTriFactorT->csrMat->row_offsets->data().get(),
1579:                         loTriFactorT->csrMat->column_indices->data().get(),
1580:                         loTriFactorT->solveInfo,
1581:                         tempGPU->data().get(),
1582:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1583:                         xarray,
1584:                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1585:                       #else
1586:                         xarray);CHKERRCUSPARSE(stat);
1587:                       #endif

1589:   /* restore */
1590:   VecCUDARestoreArrayRead(bb,&barray);
1591:   VecCUDARestoreArrayWrite(xx,&xarray);
1592:   PetscLogGpuTimeEnd();
1593:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1594:   return(0);
1595: }

1597: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1598: {
1599:   const PetscScalar                     *barray;
1600:   PetscScalar                           *xarray;
1601:   thrust::device_ptr<const PetscScalar> bGPU;
1602:   thrust::device_ptr<PetscScalar>       xGPU;
1603:   cusparseStatus_t                      stat;
1604:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1605:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1606:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1607:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1608:   PetscErrorCode                        ierr;


1612:   /* Get the GPU pointers */
1613:   VecCUDAGetArrayWrite(xx,&xarray);
1614:   VecCUDAGetArrayRead(bb,&barray);
1615:   xGPU = thrust::device_pointer_cast(xarray);
1616:   bGPU = thrust::device_pointer_cast(barray);

1618:   PetscLogGpuTimeBegin();
1619:   /* First, reorder with the row permutation */
1620:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1621:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1622:                tempGPU->begin());

1624:   /* Next, solve L */
1625:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1626:                         loTriFactor->csrMat->num_rows,
1627:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1628:                         loTriFactor->csrMat->num_entries,
1629:                       #endif
1630:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1631:                         loTriFactor->csrMat->values->data().get(),
1632:                         loTriFactor->csrMat->row_offsets->data().get(),
1633:                         loTriFactor->csrMat->column_indices->data().get(),
1634:                         loTriFactor->solveInfo,
1635:                         tempGPU->data().get(),
1636:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1637:                          xarray,
1638:                          loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1639:                       #else
1640:                          xarray);CHKERRCUSPARSE(stat);
1641:                       #endif

1643:   /* Then, solve U */
1644:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1645:                         upTriFactor->csrMat->num_rows,
1646:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1647:                         upTriFactor->csrMat->num_entries,
1648:                       #endif
1649:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1650:                         upTriFactor->csrMat->values->data().get(),
1651:                         upTriFactor->csrMat->row_offsets->data().get(),
1652:                         upTriFactor->csrMat->column_indices->data().get(),
1653:                         upTriFactor->solveInfo,xarray,
1654:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1655:                         tempGPU->data().get(),
1656:                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1657:                       #else
1658:                         tempGPU->data().get());CHKERRCUSPARSE(stat);
1659:                       #endif

1661:   /* Last, reorder with the column permutation */
1662:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1663:                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1664:                xGPU);

1666:   VecCUDARestoreArrayRead(bb,&barray);
1667:   VecCUDARestoreArrayWrite(xx,&xarray);
1668:   PetscLogGpuTimeEnd();
1669:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1670:   return(0);
1671: }

1673: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1674: {
1675:   const PetscScalar                 *barray;
1676:   PetscScalar                       *xarray;
1677:   cusparseStatus_t                  stat;
1678:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1679:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1680:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1681:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1682:   PetscErrorCode                    ierr;

1685:   /* Get the GPU pointers */
1686:   VecCUDAGetArrayWrite(xx,&xarray);
1687:   VecCUDAGetArrayRead(bb,&barray);

1689:   PetscLogGpuTimeBegin();
1690:   /* First, solve L */
1691:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1692:                         loTriFactor->csrMat->num_rows,
1693:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1694:                         loTriFactor->csrMat->num_entries,
1695:                       #endif
1696:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1697:                         loTriFactor->csrMat->values->data().get(),
1698:                         loTriFactor->csrMat->row_offsets->data().get(),
1699:                         loTriFactor->csrMat->column_indices->data().get(),
1700:                         loTriFactor->solveInfo,
1701:                         barray,
1702:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1703:                         tempGPU->data().get(),
1704:                         loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1705:                       #else
1706:                         tempGPU->data().get());CHKERRCUSPARSE(stat);
1707:                       #endif

1709:   /* Next, solve U */
1710:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1711:                         upTriFactor->csrMat->num_rows,
1712:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1713:                         upTriFactor->csrMat->num_entries,
1714:                       #endif
1715:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1716:                         upTriFactor->csrMat->values->data().get(),
1717:                         upTriFactor->csrMat->row_offsets->data().get(),
1718:                         upTriFactor->csrMat->column_indices->data().get(),
1719:                         upTriFactor->solveInfo,
1720:                         tempGPU->data().get(),
1721:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1722:                         xarray,
1723:                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1724:                       #else
1725:                         xarray);CHKERRCUSPARSE(stat);
1726:                       #endif

1728:   VecCUDARestoreArrayRead(bb,&barray);
1729:   VecCUDARestoreArrayWrite(xx,&xarray);
1730:   PetscLogGpuTimeEnd();
1731:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1732:   return(0);
1733: }

1735: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1736: {
1737:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1738:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1739:   cudaError_t        cerr;
1740:   PetscErrorCode     ierr;

1743:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1744:     CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

1746:     PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1747:     cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1748:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
1749:     PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));
1750:     PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1751:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1752:   }
1753:   return(0);
1754: }

1756: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1757: {
1758:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

1762:   MatSeqAIJCUSPARSECopyFromGPU(A);
1763:   *array = a->a;
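  /* the caller may modify the returned host array, so the CPU copy becomes the
     authoritative one */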
1764:   A->offloadmask = PETSC_OFFLOAD_CPU;
1765:   return(0);
1766: }

1768: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1769: {
1770:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1771:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1772:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1773:   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1774:   PetscErrorCode               ierr;
1775:   cusparseStatus_t             stat;
1776:   PetscBool                    both = PETSC_TRUE;
1777:   cudaError_t                  err;

1780:   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
1781:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1782:     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1783:       CsrMatrix *matrix;
1784:       matrix = (CsrMatrix*)cusparsestruct->mat->mat;

1786:       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
1787:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1788:       matrix->values->assign(a->a, a->a+a->nz);
1789:       err  = WaitForCUDA();CHKERRCUDA(err);
1790:       PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1791:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1792:       MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
1793:     } else {
1794:       PetscInt nnz;
1795:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1796:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1797:       MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1798:       delete cusparsestruct->workVector;
1799:       delete cusparsestruct->rowoffsets_gpu;
1800:       cusparsestruct->workVector = NULL;
1801:       cusparsestruct->rowoffsets_gpu = NULL;
1802:       try {
1803:         if (a->compressedrow.use) {
1804:           m    = a->compressedrow.nrows;
1805:           ii   = a->compressedrow.i;
1806:           ridx = a->compressedrow.rindex;
1807:         } else {
1808:           m    = A->rmap->n;
1809:           ii   = a->i;
1810:           ridx = NULL;
1811:         }
1812:         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
1813:         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
1814:         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1815:         else nnz = a->nz;

1817:         /* create cusparse matrix */
1818:         cusparsestruct->nrows = m;
1819:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1820:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1821:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1822:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1824:         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1825:         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1826:         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1827:         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1828:         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1829:         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1830:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

1832:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1833:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1834:           /* set the matrix */
1835:           CsrMatrix *mat= new CsrMatrix;
1836:           mat->num_rows = m;
1837:           mat->num_cols = A->cmap->n;
1838:           mat->num_entries = nnz;
1839:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1840:           mat->row_offsets->assign(ii, ii + m+1);

1842:           mat->column_indices = new THRUSTINTARRAY32(nnz);
1843:           mat->column_indices->assign(a->j, a->j+nnz);

1845:           mat->values = new THRUSTARRAY(nnz);
1846:           if (a->a) mat->values->assign(a->a, a->a+nnz);

1848:           /* assign the pointer */
1849:           matstruct->mat = mat;
1850:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1851:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1852:             stat = cusparseCreateCsr(&matstruct->matDescr,
1853:                                     mat->num_rows, mat->num_cols, mat->num_entries,
1854:                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1855:                                     mat->values->data().get(),
1856:                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1857:                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1858:           }
1859:          #endif
1860:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1861:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1862:           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1863:          #else
1864:           CsrMatrix *mat= new CsrMatrix;
1865:           mat->num_rows = m;
1866:           mat->num_cols = A->cmap->n;
1867:           mat->num_entries = nnz;
1868:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1869:           mat->row_offsets->assign(ii, ii + m+1);

1871:           mat->column_indices = new THRUSTINTARRAY32(nnz);
1872:           mat->column_indices->assign(a->j, a->j+nnz);

1874:           mat->values = new THRUSTARRAY(nnz);
1875:           if (a->a) mat->values->assign(a->a, a->a+nnz);

1877:           cusparseHybMat_t hybMat;
1878:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1879:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1880:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1881:           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1882:               matstruct->descr, mat->values->data().get(),
1883:               mat->row_offsets->data().get(),
1884:               mat->column_indices->data().get(),
1885:               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1886:           /* assign the pointer */
1887:           matstruct->mat = hybMat;

1889:           if (mat) {
1890:             if (mat->values) delete (THRUSTARRAY*)mat->values;
1891:             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1892:             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1893:             delete (CsrMatrix*)mat;
1894:           }
1895:          #endif
1896:         }

1898:         /* assign the compressed row indices */
1899:         if (a->compressedrow.use) {
1900:           cusparsestruct->workVector = new THRUSTARRAY(m);
1901:           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1902:           matstruct->cprowIndices->assign(ridx,ridx+m);
1903:           tmp = m;
1904:         } else {
1905:           cusparsestruct->workVector = NULL;
1906:           matstruct->cprowIndices    = NULL;
1907:           tmp = 0;
1908:         }
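        /* traffic just copied: row offsets (m+1 ints) + column indices (nz ints)
           + compressed row indices (tmp PetscInts) + alpha/beta constants and
           values (3+nz scalars) */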
1909:         PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));

1911:         /* assign the pointer */
1912:         cusparsestruct->mat = matstruct;
1913:       } catch(char *ex) {
1914:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1915:       }
1916:       err  = WaitForCUDA();CHKERRCUDA(err);
1917:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1918:       cusparsestruct->nonzerostate = A->nonzerostate;
1919:     }
1920:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1921:   }
1922:   return(0);
1923: }

1925: struct VecCUDAPlusEquals
1926: {
1927:   template <typename Tuple>
1928:   __host__ __device__
1929:   void operator()(Tuple t)
1930:   {
1931:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1932:   }
1933: };

1935: struct VecCUDAEquals
1936: {
1937:   template <typename Tuple>
1938:   __host__ __device__
1939:   void operator()(Tuple t)
1940:   {
1941:     thrust::get<1>(t) = thrust::get<0>(t);
1942:   }
1943: };

1945: struct VecCUDAEqualsReverse
1946: {
1947:   template <typename Tuple>
1948:   __host__ __device__
1949:   void operator()(Tuple t)
1950:   {
1951:     thrust::get<0>(t) = thrust::get<1>(t);
1952:   }
1953: };
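/* These functors are meant to be applied elementwise over pairs of device vectors,
   presumably via thrust::for_each on a zip iterator; a minimal sketch with
   hypothetical device vectors src and dst:

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(src.begin(), dst.begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(src.end(),   dst.end())),
                      VecCUDAPlusEquals());   // dst[i] += src[i]

   VecCUDAEquals copies slot 0 into slot 1, and VecCUDAEqualsReverse the other way. */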

1955: struct MatMatCusparse {
1956:   PetscBool             cisdense;
1957:   PetscScalar           *Bt;
1958:   Mat                   X;
1959:   PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1960:   PetscLogDouble        flops;
1961:   CsrMatrix             *Bcsr;

1963: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1964:   cusparseSpMatDescr_t  matSpBDescr;
1965:   PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
1966:   cusparseDnMatDescr_t  matBDescr;
1967:   cusparseDnMatDescr_t  matCDescr;
1968:   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
1969:  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
1970:   void                  *dBuffer4;
1971:   void                  *dBuffer5;
1972:  #endif
1973:   size_t                mmBufferSize;
1974:   void                  *mmBuffer;
1975:   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1976:   cusparseSpGEMMDescr_t spgemmDesc;
1977: #endif
1978: };
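/* For orientation: the cuSPARSE-11 generic SpGEMM API that mmBuffer/mmBuffer2 and
   spgemmDesc serve is a multi-phase protocol, roughly (a sketch of the documented
   call sequence, not code from this file):

     cusparseSpGEMM_workEstimation(handle,opA,opB,alpha,matA,matB,beta,matC,
                                   computeType,alg,spgemmDesc,&sz2,NULL);   // query size
     cudaMalloc(&mmBuffer2,sz2);
     cusparseSpGEMM_workEstimation(...,&sz2,mmBuffer2);                     // estimate work
     cusparseSpGEMM_compute(handle,opA,opB,alpha,matA,matB,beta,matC,
                            computeType,alg,spgemmDesc,&sz,NULL);           // query size
     cudaMalloc(&mmBuffer,sz);
     cusparseSpGEMM_compute(...,&sz,mmBuffer);                              // multiply
     cusparseSpGEMM_copy(handle,opA,opB,alpha,matA,matB,beta,matC,
                         computeType,alg,spgemmDesc);                       // fill C
*/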

1980: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1981: {
1982:   PetscErrorCode   ierr;
1983:   MatMatCusparse   *mmdata = (MatMatCusparse *)data;
1984:   cudaError_t      cerr;
1985:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1986:   cusparseStatus_t stat;
1987:  #endif

1990:   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1991:   delete mmdata->Bcsr;
1992:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1993:   if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
1994:   if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
1995:   if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
1996:   if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
1997:  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
1998:   if (mmdata->dBuffer4)  { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
1999:   if (mmdata->dBuffer5)  { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
2000:  #endif
2001:   if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
2002:   if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
2003:  #endif
2004:   MatDestroy(&mmdata->X);
2005:   PetscFree(data);
2006:   return(0);
2007: }

2009: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

2011: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2012: {
2013:   Mat_Product                  *product = C->product;
2014:   Mat                          A,B;
2015:   PetscInt                     m,n,blda,clda;
2016:   PetscBool                    flg,biscuda;
2017:   Mat_SeqAIJCUSPARSE           *cusp;
2018:   cusparseStatus_t             stat;
2019:   cusparseOperation_t          opA;
2020:   const PetscScalar            *barray;
2021:   PetscScalar                  *carray;
2022:   PetscErrorCode               ierr;
2023:   MatMatCusparse               *mmdata;
2024:   Mat_SeqAIJCUSPARSEMultStruct *mat;
2025:   CsrMatrix                    *csrmat;

2028:   MatCheckProduct(C,1);
2029:   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2030:   mmdata = (MatMatCusparse*)product->data;
2031:   A    = product->A;
2032:   B    = product->B;
2033:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2034:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2035:   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2036:      Instead of silently accepting the wrong answer, I prefer to raise the error */
2037:   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2038:   MatSeqAIJCUSPARSECopyToGPU(A);
2039:   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2040:   switch (product->type) {
2041:   case MATPRODUCT_AB:
2042:   case MATPRODUCT_PtAP:
2043:     mat = cusp->mat;
2044:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2045:     m   = A->rmap->n;
2046:     n   = B->cmap->n;
2047:     break;
2048:   case MATPRODUCT_AtB:
2049:     if (!A->form_explicit_transpose) {
2050:       mat = cusp->mat;
2051:       opA = CUSPARSE_OPERATION_TRANSPOSE;
2052:     } else {
2053:       MatSeqAIJCUSPARSEFormExplicitTranspose(A);
2054:       mat  = cusp->matTranspose;
2055:       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2056:     }
2057:     m = A->cmap->n;
2058:     n = B->cmap->n;
2059:     break;
2060:   case MATPRODUCT_ABt:
2061:   case MATPRODUCT_RARt:
2062:     mat = cusp->mat;
2063:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2064:     m   = A->rmap->n;
2065:     n   = B->rmap->n;
2066:     break;
2067:   default:
2068:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2069:   }
2070:   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2071:   csrmat = (CsrMatrix*)mat->mat;
2072:   /* if the user passed a CPU matrix, copy the data to the GPU */
2073:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
2074:   if (!biscuda) {MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);}
2075:   MatDenseCUDAGetArrayRead(B,&barray);

2077:   MatDenseGetLDA(B,&blda);
2078:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2079:     MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
2080:     MatDenseGetLDA(mmdata->X,&clda);
2081:   } else {
2082:     MatDenseCUDAGetArrayWrite(C,&carray);
2083:     MatDenseGetLDA(C,&clda);
2084:   }

2086:   PetscLogGpuTimeBegin();
2087:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2088:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2089:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2090:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2091:     size_t mmBufferSize;
2092:     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2093:     if (!mmdata->matBDescr) {
2094:       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2095:       mmdata->Blda = blda;
2096:     }

2098:     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2099:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2100:       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2101:       mmdata->Clda = clda;
2102:     }

2104:     if (!mat->matDescr) {
2105:       stat = cusparseCreateCsr(&mat->matDescr,
2106:                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2107:                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2108:                                csrmat->values->data().get(),
2109:                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2110:                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2111:     }
2112:     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2113:                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2114:                                    mmdata->matCDescr,cusparse_scalartype,
2115:                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2116:     if (!mmdata->mmBuffer || mmdata->mmBufferSize < mmBufferSize) {
2117:       cudaError_t cerr;
2118:       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2119:       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2120:       mmdata->mmBufferSize = mmBufferSize;
2121:     }
2122:     mmdata->initialized = PETSC_TRUE;
2123:   } else {
2124:     /* to be safe, always update pointers of the mats */
2125:     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2126:     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2127:     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2128:   }

2130:   /* do cusparseSpMM, which supports transpose on B */
2131:   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2132:                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2133:                       mmdata->matCDescr,cusparse_scalartype,
2134:                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2135:  #else
2136:   PetscInt k;
2137:   /* cusparseXcsrmm does not support transpose on B */
2138:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2139:     cublasHandle_t cublasv2handle;
2140:     cublasStatus_t cerr;

2142:     PetscCUBLASGetHandle(&cublasv2handle);
2143:     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2144:                        B->cmap->n,B->rmap->n,
2145:                        &PETSC_CUSPARSE_ONE ,barray,blda,
2146:                        &PETSC_CUSPARSE_ZERO,barray,blda,
2147:                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2148:     blda = B->cmap->n;
2149:     k    = B->cmap->n;
2150:   } else {
2151:     k    = B->rmap->n;
2152:   }

2154:   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2155:   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2156:                            csrmat->num_entries,mat->alpha_one,mat->descr,
2157:                            csrmat->values->data().get(),
2158:                            csrmat->row_offsets->data().get(),
2159:                            csrmat->column_indices->data().get(),
2160:                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2161:                            carray,clda);CHKERRCUSPARSE(stat);
2162:  #endif
2163:   PetscLogGpuTimeEnd();
2164:   PetscLogGpuFlops(n*2.0*csrmat->num_entries);
2165:   MatDenseCUDARestoreArrayRead(B,&barray);
2166:   if (product->type == MATPRODUCT_RARt) {
2167:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2168:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
2169:   } else if (product->type == MATPRODUCT_PtAP) {
2170:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2171:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
2172:   } else {
2173:     MatDenseCUDARestoreArrayWrite(C,&carray);
2174:   }
2175:   if (mmdata->cisdense) {
2176:     MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
2177:   }
2178:   if (!biscuda) {
2179:     MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
2180:   }
2181:   return(0);
2182: }
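For reference, the CUDA-11 generic SpMM machinery used above follows a fixed create-descriptors / size-buffer / compute pattern. Below is a minimal, self-contained sketch of that call sequence for C = A*B with a CSR matrix A and column-major dense B and C; the handle, device pointers and sizes (d_rowptr, d_colind, d_val, d_B, d_C, m, n, k, nnz) are hypothetical placeholders, and error checking is omitted.

  #include <cuda_runtime.h>
  #include <cusparse.h>

  /* sketch: C(m x n) = A(m x k, CSR) * B(k x n, column-major), double precision */
  static void spmm_sketch(cusparseHandle_t handle, int m, int n, int k, int nnz,
                          int *d_rowptr, int *d_colind, double *d_val, double *d_B, double *d_C)
  {
    cusparseSpMatDescr_t matA;
    cusparseDnMatDescr_t matB, matC;
    size_t               bufferSize = 0;
    void                 *dBuffer   = NULL;
    double               alpha = 1.0, beta = 0.0;

    /* wrap the raw device arrays in cuSPARSE descriptors */
    cusparseCreateCsr(&matA, m, k, nnz, d_rowptr, d_colind, d_val,
                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
    cusparseCreateDnMat(&matB, k, n, k, d_B, CUDA_R_64F, CUSPARSE_ORDER_COL);
    cusparseCreateDnMat(&matC, m, n, m, d_C, CUDA_R_64F, CUSPARSE_ORDER_COL);

    /* size and allocate the workspace, then compute C = alpha*A*B + beta*C */
    cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                            &alpha, matA, matB, &beta, matC, CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize);
    cudaMalloc(&dBuffer, bufferSize);
    cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                 &alpha, matA, matB, &beta, matC, CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, dBuffer);

    cudaFree(dBuffer);
    cusparseDestroyDnMat(matC);
    cusparseDestroyDnMat(matB);
    cusparseDestroySpMat(matA);
  }

The routine above additionally caches the descriptors and the workspace in mmdata, so repeated numeric calls only refresh the value pointers via cusparseSpMatSetValues()/cusparseDnMatSetValues().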

2184: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2185: {
2186:   Mat_Product        *product = C->product;
2187:   Mat                A,B;
2188:   PetscInt           m,n;
2189:   PetscBool          cisdense,flg;
2190:   PetscErrorCode     ierr;
2191:   MatMatCusparse     *mmdata;
2192:   Mat_SeqAIJCUSPARSE *cusp;

2195:   MatCheckProduct(C,1);
2196:   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2197:   A    = product->A;
2198:   B    = product->B;
2199:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2200:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2201:   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2202:   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2203:   switch (product->type) {
2204:   case MATPRODUCT_AB:
2205:     m = A->rmap->n;
2206:     n = B->cmap->n;
2207:     break;
2208:   case MATPRODUCT_AtB:
2209:     m = A->cmap->n;
2210:     n = B->cmap->n;
2211:     break;
2212:   case MATPRODUCT_ABt:
2213:     m = A->rmap->n;
2214:     n = B->rmap->n;
2215:     break;
2216:   case MATPRODUCT_PtAP:
2217:     m = B->cmap->n;
2218:     n = B->cmap->n;
2219:     break;
2220:   case MATPRODUCT_RARt:
2221:     m = B->rmap->n;
2222:     n = B->rmap->n;
2223:     break;
2224:   default:
2225:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2226:   }
2227:   MatSetSizes(C,m,n,m,n);
2228:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2229:   PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
2230:   MatSetType(C,MATSEQDENSECUDA);

2232:   /* product data */
2233:   PetscNew(&mmdata);
2234:   mmdata->cisdense = cisdense;
2235:  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2236:   /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2237:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2238:     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2239:   }
2240:  #endif
2241:   /* for these products we need intermediate storage */
2242:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2243:     MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
2244:     MatSetType(mmdata->X,MATSEQDENSECUDA);
2245:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2246:       MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
2247:     } else {
2248:       MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
2249:     }
2250:   }
2251:   C->product->data    = mmdata;
2252:   C->product->destroy = MatDestroy_MatMatCusparse;

2254:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2255:   return(0);
2256: }
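At the user level this symbolic/numeric split is driven through the MatProduct interface; a minimal sketch (assuming A is MATSEQAIJCUSPARSE and B is MATSEQDENSECUDA, both already assembled):

  Mat C;

  MatProductCreate(A,B,NULL,&C);        /* attaches a Mat_Product referencing A and B to C */
  MatProductSetType(C,MATPRODUCT_AB);   /* or MATPRODUCT_AtB, MATPRODUCT_ABt, MATPRODUCT_PtAP, MATPRODUCT_RARt */
  MatProductSetFromOptions(C);          /* dispatches to MatProductSetFromOptions_SeqAIJCUSPARSE below */
  MatProductSymbolic(C);                /* the routine above: sets sizes/type, allocates mmdata */
  MatProductNumeric(C);                 /* runs the cusparseSpMM-based numeric phase */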

2258: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2259: {
2260:   Mat_Product                  *product = C->product;
2261:   Mat                          A,B;
2262:   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2263:   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2264:   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2265:   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2266:   PetscBool                    flg;
2267:   PetscErrorCode               ierr;
2268:   cusparseStatus_t             stat;
2269:   cudaError_t                  cerr;
2270:   MatProductType               ptype;
2271:   MatMatCusparse               *mmdata;
2272: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2273:   cusparseSpMatDescr_t         BmatSpDescr;
2274: #endif
2275:   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

2278:   MatCheckProduct(C,1);
2279:   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2280:   PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);
2281:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2282:   mmdata = (MatMatCusparse*)C->product->data;
2283:   A = product->A;
2284:   B = product->B;
2285:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2286:     mmdata->reusesym = PETSC_FALSE;
2287:     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2288:     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2289:     Cmat = Ccusp->mat;
2290:     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2291:     Ccsr = (CsrMatrix*)Cmat->mat;
2292:     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2293:     goto finalize;
2294:   }
2295:   if (!c->nz) goto finalize;
2296:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2297:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2298:   PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2299:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2300:   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2301:   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2302:   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2303:   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2304:   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2305:   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2306:   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2307:   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2308:   MatSeqAIJCUSPARSECopyToGPU(A);
2309:   MatSeqAIJCUSPARSECopyToGPU(B);

2311:   ptype = product->type;
2312:   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2313:     ptype = MATPRODUCT_AB;
2314:     if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2315:   }
2316:   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2317:     ptype = MATPRODUCT_AB;
2318:     if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2319:   }
2320:   switch (ptype) {
2321:   case MATPRODUCT_AB:
2322:     Amat = Acusp->mat;
2323:     Bmat = Bcusp->mat;
2324:     break;
2325:   case MATPRODUCT_AtB:
2326:     Amat = Acusp->matTranspose;
2327:     Bmat = Bcusp->mat;
2328:     break;
2329:   case MATPRODUCT_ABt:
2330:     Amat = Acusp->mat;
2331:     Bmat = Bcusp->matTranspose;
2332:     break;
2333:   default:
2334:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2335:   }
2336:   Cmat = Ccusp->mat;
2337:   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2338:   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2339:   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2340:   Acsr = (CsrMatrix*)Amat->mat;
2341:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2342:   Ccsr = (CsrMatrix*)Cmat->mat;
2343:   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2344:   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2345:   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2346:   PetscLogGpuTimeBegin();
2347: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2348:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2349:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2350:   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2351:     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2352:                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2353:                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2354:                                mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2355:   #else
2356:     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2357:                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2358:                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2359:                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2360:     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2361:                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2362:                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2363:   #endif
2364: #else
2365:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2366:                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2367:                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2368:                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2369:                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2370: #endif
2371:   PetscLogGpuFlops(mmdata->flops);
2372:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2373:   PetscLogGpuTimeEnd();
2374:   C->offloadmask = PETSC_OFFLOAD_GPU;
2375: finalize:
2376:   /* shorter version of MatAssemblyEnd_SeqAIJ */
2377:   PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);
2378:   PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");
2379:   PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);
2380:   c->reallocs         = 0;
2381:   C->info.mallocs    += 0;
2382:   C->info.nz_unneeded = 0;
2383:   C->assembled = C->was_assembled = PETSC_TRUE;
2384:   C->num_ass++;
2385:   return(0);
2386: }
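A usage sketch of the reusesym fast path above: when the product is created through the classic API (api_user true) and the operands are up to date on the GPU, the values of C are already computed during the symbolic phase, so the first numeric call reduces to the assembly bookkeeping after the finalize label.

  Mat C;

  MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C); /* symbolic (which may also compute the values) + numeric */
  /* ... change the values (but not the nonzero pattern) of A or B ... */
  MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);   /* reruns only this numeric routine */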

2388: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2389: {
2390:   Mat_Product                  *product = C->product;
2391:   Mat                          A,B;
2392:   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2393:   Mat_SeqAIJ                   *a,*b,*c;
2394:   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2395:   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2396:   PetscInt                     i,j,m,n,k;
2397:   PetscBool                    flg;
2398:   PetscErrorCode               ierr;
2399:   cusparseStatus_t             stat;
2400:   cudaError_t                  cerr;
2401:   MatProductType               ptype;
2402:   MatMatCusparse               *mmdata;
2403:   PetscLogDouble               flops;
2404:   PetscBool                    biscompressed,ciscompressed;
2405: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2406:   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2407:   cusparseSpMatDescr_t         BmatSpDescr;
2408: #else
2409:   int                          cnz;
2410: #endif
2411:   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

2414:   MatCheckProduct(C,1);
2415:   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2416:   A    = product->A;
2417:   B    = product->B;
2418:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2419:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2420:   PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2421:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2422:   a = (Mat_SeqAIJ*)A->data;
2423:   b = (Mat_SeqAIJ*)B->data;
2424:   /* product data */
2425:   PetscNew(&mmdata);
2426:   C->product->data    = mmdata;
2427:   C->product->destroy = MatDestroy_MatMatCusparse;

2429:   MatSeqAIJCUSPARSECopyToGPU(A);
2430:   MatSeqAIJCUSPARSECopyToGPU(B);
2431:   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2432:   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2433:   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2434:   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

2436:   ptype = product->type;
2437:   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2438:     ptype = MATPRODUCT_AB;
2439:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2440:   }
2441:   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2442:     ptype = MATPRODUCT_AB;
2443:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2444:   }
2445:   biscompressed = PETSC_FALSE;
2446:   ciscompressed = PETSC_FALSE;
2447:   switch (ptype) {
2448:   case MATPRODUCT_AB:
2449:     m = A->rmap->n;
2450:     n = B->cmap->n;
2451:     k = A->cmap->n;
2452:     Amat = Acusp->mat;
2453:     Bmat = Bcusp->mat;
2454:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2455:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2456:     break;
2457:   case MATPRODUCT_AtB:
2458:     m = A->cmap->n;
2459:     n = B->cmap->n;
2460:     k = A->rmap->n;
2461:     MatSeqAIJCUSPARSEFormExplicitTranspose(A);
2462:     Amat = Acusp->matTranspose;
2463:     Bmat = Bcusp->mat;
2464:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2465:     break;
2466:   case MATPRODUCT_ABt:
2467:     m = A->rmap->n;
2468:     n = B->rmap->n;
2469:     k = A->cmap->n;
2470:     MatSeqAIJCUSPARSEFormExplicitTranspose(B);
2471:     Amat = Acusp->mat;
2472:     Bmat = Bcusp->matTranspose;
2473:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2474:     break;
2475:   default:
2476:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2477:   }

2479:   /* create cusparse matrix */
2480:   MatSetSizes(C,m,n,m,n);
2481:   MatSetType(C,MATSEQAIJCUSPARSE);
2482:   c     = (Mat_SeqAIJ*)C->data;
2483:   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2484:   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2485:   Ccsr  = new CsrMatrix;

2487:   c->compressedrow.use = ciscompressed;
2488:   if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format too */
2489:     c->compressedrow.nrows = a->compressedrow.nrows;
2490:     PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);
2491:     PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);
2492:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2493:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2494:     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2495:   } else {
2496:     c->compressedrow.nrows  = 0;
2497:     c->compressedrow.i      = NULL;
2498:     c->compressedrow.rindex = NULL;
2499:     Ccusp->workVector       = NULL;
2500:     Cmat->cprowIndices      = NULL;
2501:   }
2502:   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2503:   Ccusp->mat      = Cmat;
2504:   Ccusp->mat->mat = Ccsr;
2505:   Ccsr->num_rows    = Ccusp->nrows;
2506:   Ccsr->num_cols    = n;
2507:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2508:   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2509:   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2510:   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2511:   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2512:   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2513:   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2514:   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2515:   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2516:   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2517:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cuSPARSE raises errors in various calls when matrices have zero rows/columns! */
2518:     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2519:     c->nz = 0;
2520:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2521:     Ccsr->values = new THRUSTARRAY(c->nz);
2522:     goto finalizesym;
2523:   }

2525:   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2526:   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2527:   Acsr = (CsrMatrix*)Amat->mat;
2528:   if (!biscompressed) {
2529:     Bcsr = (CsrMatrix*)Bmat->mat;
2530: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2531:     BmatSpDescr = Bmat->matDescr;
2532: #endif
2533:   } else { /* we need to use row offsets for the full matrix */
2534:     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2535:     Bcsr = new CsrMatrix;
2536:     Bcsr->num_rows       = B->rmap->n;
2537:     Bcsr->num_cols       = cBcsr->num_cols;
2538:     Bcsr->num_entries    = cBcsr->num_entries;
2539:     Bcsr->column_indices = cBcsr->column_indices;
2540:     Bcsr->values         = cBcsr->values;
2541:     if (!Bcusp->rowoffsets_gpu) {
2542:       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2543:       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2544:       PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
2545:     }
2546:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2547:     mmdata->Bcsr = Bcsr;
2548: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2549:     if (Bcsr->num_rows && Bcsr->num_cols) {
2550:       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2551:                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2552:                                Bcsr->values->data().get(),
2553:                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2554:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2555:     }
2556:     BmatSpDescr = mmdata->matSpBDescr;
2557: #endif
2558:   }
2559:   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2560:   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2561:   /* precompute flops count */
2562:   if (ptype == MATPRODUCT_AB) {
2563:     for (i=0, flops = 0; i<A->rmap->n; i++) {
2564:       const PetscInt st = a->i[i];
2565:       const PetscInt en = a->i[i+1];
2566:       for (j=st; j<en; j++) {
2567:         const PetscInt brow = a->j[j];
2568:         flops += 2.*(b->i[brow+1] - b->i[brow]);
2569:       }
2570:     }
2571:   } else if (ptype == MATPRODUCT_AtB) {
2572:     for (i=0, flops = 0; i<A->rmap->n; i++) {
2573:       const PetscInt anzi = a->i[i+1] - a->i[i];
2574:       const PetscInt bnzi = b->i[i+1] - b->i[i];
2575:       flops += (2.*anzi)*bnzi;
2576:     }
2577:   } else { /* TODO */
2578:     flops = 0.;
2579:   }

2581:   mmdata->flops = flops;
2582:   PetscLogGpuTimeBegin();

2584: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2585:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2586:   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2587:                           NULL, NULL, NULL,
2588:                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2589:                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2590:   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2591:  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2592:  {
2593:   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2594:      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2595:   */
2596:   void*  dBuffer1 = NULL;
2597:   void*  dBuffer2 = NULL;
2598:   void*  dBuffer3 = NULL;
2599:   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2600:   size_t bufferSize1 = 0;
2601:   size_t bufferSize2 = 0;
2602:   size_t bufferSize3 = 0;
2603:   size_t bufferSize4 = 0;
2604:   size_t bufferSize5 = 0;

2606:   /*----------------------------------------------------------------------*/
2607:   /* ask for bufferSize1 bytes of external memory */
2608:   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2609:                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2610:                                             &bufferSize1, NULL);CHKERRCUSPARSE(stat);
2611:   cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
2612:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2613:   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2614:                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2615:                                             &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

2617:   /*----------------------------------------------------------------------*/
2618:   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2619:                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2620:                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
2621:   cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
2622:   cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
2623:   cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
2624:   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2625:                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2626:                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
2627:   cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
2628:   cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

2630:   /*----------------------------------------------------------------------*/
2631:   /* get matrix C non-zero entries C_nnz1 */
2632:   stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2633:   c->nz = (PetscInt) C_nnz1;
2634:   /* allocate matrix C */
2635:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2636:   Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2637:   /* update matC with the new pointers */
2638:   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2639:                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);

2641:   /*----------------------------------------------------------------------*/
2642:   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2643:                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2644:                                   &bufferSize5, NULL);CHKERRCUSPARSE(stat);
2645:   cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
2646:   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2647:                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2648:                                   &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
2649:   cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
2650:   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2651:                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2652:                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2653:                                      mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2654:   PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);
2655:  }
2656:  #else
2657:   size_t bufSize2;
2658:   /* ask for bufferSize bytes of external memory */
2659:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2660:                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2661:                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2662:                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2663:   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2664:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2665:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2666:                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2667:                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2668:                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2669:   /* ask again for the number of bytes of external memory needed by the compute step */
2670:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2671:                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2672:                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2673:                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2674:   /* Neither the CUSPARSE documentation nor the API is clear here:
2675:      we need both buffers for the operations to work properly!
2676:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2677:      it only appears in the workEstimation calls, yet it seems to be needed in compute, so presumably its address
2678:      is stored in the descriptor! What a messy API... */
2679:   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2680:   /* compute the intermediate product of A * B */
2681:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2682:                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2683:                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2684:                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2685:   /* get matrix C non-zero entries C_nnz1 */
2686:   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2687:   c->nz = (PetscInt) C_nnz1;
2688:   PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);
2689:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2690:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2691:   Ccsr->values = new THRUSTARRAY(c->nz);
2692:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2693:   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2694:                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2695:   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2696:                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2697:                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2698:  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2699: #else
2700:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2701:   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2702:                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2703:                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2704:                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2705:                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2706:   c->nz = cnz;
2707:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2708:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2709:   Ccsr->values = new THRUSTARRAY(c->nz);
2710:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

2712:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2713:   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2714:      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when
2715:      D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2716:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2717:                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2718:                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2719:                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2720:                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2721: #endif
2722:   PetscLogGpuFlops(mmdata->flops);
2723:   PetscLogGpuTimeEnd();
2724: finalizesym:
2725:   c->singlemalloc = PETSC_FALSE;
2726:   c->free_a       = PETSC_TRUE;
2727:   c->free_ij      = PETSC_TRUE;
2728:   PetscMalloc1(m+1,&c->i);
2729:   PetscMalloc1(c->nz,&c->j);
2730:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2731:     PetscInt *d_i = c->i;
2732:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2733:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2734:     ii   = *Ccsr->row_offsets;
2735:     jj   = *Ccsr->column_indices;
2736:     if (ciscompressed) d_i = c->compressedrow.i;
2737:     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2738:     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2739:   } else {
2740:     PetscInt *d_i = c->i;
2741:     if (ciscompressed) d_i = c->compressedrow.i;
2742:     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2743:     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2744:   }
2745:   if (ciscompressed) { /* need to expand host row offsets */
2746:     PetscInt r = 0;
2747:     c->i[0] = 0;
2748:     for (k = 0; k < c->compressedrow.nrows; k++) {
2749:       const PetscInt next = c->compressedrow.rindex[k];
2750:       const PetscInt old = c->compressedrow.i[k];
2751:       for (; r < next; r++) c->i[r+1] = old;
2752:     }
2753:     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2754:   }
2755:   PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
2756:   PetscMalloc1(m,&c->ilen);
2757:   PetscMalloc1(m,&c->imax);
2758:   c->maxnz = c->nz;
2759:   c->nonzerorowcnt = 0;
2760:   c->rmax = 0;
2761:   for (k = 0; k < m; k++) {
2762:     const PetscInt nn = c->i[k+1] - c->i[k];
2763:     c->ilen[k] = c->imax[k] = nn;
2764:     c->nonzerorowcnt += (PetscInt)!!nn;
2765:     c->rmax = PetscMax(c->rmax,nn);
2766:   }
2767:   MatMarkDiagonal_SeqAIJ(C);
2768:   PetscMalloc1(c->nz,&c->a);
2769:   Ccsr->num_entries = c->nz;

2771:   C->nonzerostate++;
2772:   PetscLayoutSetUp(C->rmap);
2773:   PetscLayoutSetUp(C->cmap);
2774:   Ccusp->nonzerostate = C->nonzerostate;
2775:   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2776:   C->preallocated  = PETSC_TRUE;
2777:   C->assembled     = PETSC_FALSE;
2778:   C->was_assembled = PETSC_FALSE;
2779:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2780:     mmdata->reusesym = PETSC_TRUE;
2781:     C->offloadmask   = PETSC_OFFLOAD_GPU;
2782:   }
2783:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2784:   return(0);
2785: }
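To summarize the CUDA >= 11.4 branch above: the cusparseSpGEMMreuse API (following the NVIDIA sample referenced in the comments) splits C = A*B into phases so the expensive structural analysis is done once and only the value computation is repeated. A condensed sketch; the handle, matrix descriptors, opA/opB, scalars, and the buffer allocations are assumed, and each sizing call is normally made twice (first with a NULL buffer to query the size):

  cusparseSpGEMMDescr_t spgemmDesc;
  int64_t               rows, cols, nnz;

  cusparseSpGEMM_createDescr(&spgemmDesc);
  /* 1) inspect A and B */
  cusparseSpGEMMreuse_workEstimation(handle, opA, opB, matA, matB, matC,
                                     CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bsz1, buf1);
  /* 2) compute the sparsity pattern of C */
  cusparseSpGEMMreuse_nnz(handle, opA, opB, matA, matB, matC, CUSPARSE_SPGEMM_DEFAULT,
                          spgemmDesc, &bsz2, buf2, &bsz3, buf3, &bsz4, buf4);
  cusparseSpMatGetSize(matC, &rows, &cols, &nnz);  /* allocate C's arrays, then cusparseCsrSetPointers() */
  /* 3) copy the structure into C's arrays */
  cusparseSpGEMMreuse_copy(handle, opA, opB, matA, matB, matC, CUSPARSE_SPGEMM_DEFAULT,
                           spgemmDesc, &bsz5, buf5);
  /* 4) numeric phase; repeatable whenever the values of A or B change */
  cusparseSpGEMMreuse_compute(handle, opA, opB, alpha, matA, matB, beta, matC,
                              computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc);

Phase 4 is exactly what MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE() reruns; buf4 and buf5 must outlive the symbolic phase, which is why they are kept in mmdata as dBuffer4/dBuffer5.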

2787: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

2789: /* handles sparse or dense B */
2790: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2791: {
2792:   Mat_Product    *product = mat->product;
2794:   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

2797:   MatCheckProduct(mat,1);
2798:   PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);
2799:   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2800:     PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);
2801:   }
2802:   if (product->type == MATPRODUCT_ABC) {
2803:     Ciscusp = PETSC_FALSE;
2804:     if (!product->C->boundtocpu) {
2805:       PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);
2806:     }
2807:   }
2808:   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2809:     PetscBool usecpu = PETSC_FALSE;
2810:     switch (product->type) {
2811:     case MATPRODUCT_AB:
2812:       if (product->api_user) {
2813:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
2814:         PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2815:         PetscOptionsEnd();
2816:       } else {
2817:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
2818:         PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2819:         PetscOptionsEnd();
2820:       }
2821:       break;
2822:     case MATPRODUCT_AtB:
2823:       if (product->api_user) {
2824:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
2825:         PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2826:         PetscOptionsEnd();
2827:       } else {
2828:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
2829:         PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2830:         PetscOptionsEnd();
2831:       }
2832:       break;
2833:     case MATPRODUCT_PtAP:
2834:       if (product->api_user) {
2835:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
2836:         PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2837:         PetscOptionsEnd();
2838:       } else {
2839:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
2840:         PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2841:         PetscOptionsEnd();
2842:       }
2843:       break;
2844:     case MATPRODUCT_RARt:
2845:       if (product->api_user) {
2846:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
2847:         PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2848:         PetscOptionsEnd();
2849:       } else {
2850:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
2851:         PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2852:         PetscOptionsEnd();
2853:       }
2854:       break;
2855:     case MATPRODUCT_ABC:
2856:       if (product->api_user) {
2857:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
2858:         PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2859:         PetscOptionsEnd();
2860:       } else {
2861:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
2862:         PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2863:         PetscOptionsEnd();
2864:       }
2865:       break;
2866:     default:
2867:       break;
2868:     }
2869:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2870:   }
2871:   /* dispatch */
2872:   if (isdense) {
2873:     switch (product->type) {
2874:     case MATPRODUCT_AB:
2875:     case MATPRODUCT_AtB:
2876:     case MATPRODUCT_ABt:
2877:     case MATPRODUCT_PtAP:
2878:     case MATPRODUCT_RARt:
2879:      if (product->A->boundtocpu) {
2880:         MatProductSetFromOptions_SeqAIJ_SeqDense(mat);
2881:       } else {
2882:         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2883:       }
2884:       break;
2885:     case MATPRODUCT_ABC:
2886:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2887:       break;
2888:     default:
2889:       break;
2890:     }
2891:   } else if (Biscusp && Ciscusp) {
2892:     switch (product->type) {
2893:     case MATPRODUCT_AB:
2894:     case MATPRODUCT_AtB:
2895:     case MATPRODUCT_ABt:
2896:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2897:       break;
2898:     case MATPRODUCT_PtAP:
2899:     case MATPRODUCT_RARt:
2900:     case MATPRODUCT_ABC:
2901:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2902:       break;
2903:     default:
2904:       break;
2905:     }
2906:   } else { /* fallback for AIJ */
2907:     MatProductSetFromOptions_SeqAIJ(mat);
2908:   }
2909:   return(0);
2910: }
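For illustration, the CPU fallbacks registered above can be requested per product type; with the api_user (MatMatMult) naming this is, e.g.,

  /* equivalent to passing -matmatmult_backend_cpu on the command line */
  PetscOptionsSetValue(NULL,"-matmatmult_backend_cpu","1");
  MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);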

2912: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2913: {

2917:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2918:   return(0);
2919: }

2921: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2922: {

2926:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2927:   return(0);
2928: }

2930: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2931: {

2935:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2936:   return(0);
2937: }

2939: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2940: {

2944:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2945:   return(0);
2946: }

2948: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2949: {

2953:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2954:   return(0);
2955: }

2957: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2958: {
2959:   int i = blockIdx.x*blockDim.x + threadIdx.x;
2960:   if (i < n) y[idx[i]] += x[i];
2961: }
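The kernel above performs y[idx[i]] += x[i] for each i < n, one thread per entry, with the usual guard against the last partial block. It is launched below in MatMultAddKernel_SeqAIJCUSPARSE() on the default PETSc stream as

  ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,idx,x,y);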

2963: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2964: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2965: {
2966:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2967:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2968:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2969:   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2970:   PetscErrorCode               ierr;
2971:   cusparseStatus_t             stat;
2972:   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2973:   PetscBool                    compressed;
2974: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2975:   PetscInt                     nx,ny;
2976: #endif

2979:   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
2980:   if (!a->nonzerorowcnt) {
2981:     if (!yy) {VecSet_SeqCUDA(zz,0);}
2982:     else {VecCopy_SeqCUDA(yy,zz);}
2983:     return(0);
2984:   }
2985:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc.) */
2986:   MatSeqAIJCUSPARSECopyToGPU(A);
2987:   if (!trans) {
2988:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2989:     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2990:   } else {
2991:     if (herm || !A->form_explicit_transpose) {
2992:       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2993:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2994:     } else {
2995:       if (!cusparsestruct->matTranspose) {MatSeqAIJCUSPARSEFormExplicitTranspose(A);}
2996:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2997:     }
2998:   }
2999:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3000:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

3002:   try {
3003:     VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
3004:     if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so need to get uptodate zarray on GPU */
3005:     else {VecCUDAGetArrayWrite(zz,&zarray);} /* write zz, so no need to init zarray on GPU */

3007:     PetscLogGpuTimeBegin();
3008:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3009:       /* z = A x + beta y.
3010:          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3011:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3012:       */
3013:       xptr = xarray;
3014:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3015:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3016:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3017:       /* Get the lengths of x and y for y=Ax. ny might be smaller than the work vector's allocated length, since the work vector is
3018:           allocated to accommodate different uses. So we get the length info directly from the matrix.
3019:        */
3020:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3021:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3022:         nx = mat->num_cols;
3023:         ny = mat->num_rows;
3024:       }
3025:      #endif
3026:     } else {
3027:       /* z = A^T x + beta y
3028:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3029:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3030:        */
3031:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3032:       dptr = zarray;
3033:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3034:       if (compressed) { /* Scatter x to work vector */
3035:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3036:         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3037:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3038:                          VecCUDAEqualsReverse());
3039:       }
3040:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3041:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3042:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3043:         nx = mat->num_rows;
3044:         ny = mat->num_cols;
3045:       }
3046:      #endif
3047:     }

3049:     /* csr_spmv does y = alpha op(A) x + beta y */
3050:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3051:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3052:       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3053:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3054:         cudaError_t cerr;
3055:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
3056:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
3057:         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
3058:                                 matstruct->matDescr,
3059:                                 matstruct->cuSpMV[opA].vecXDescr, beta,
3060:                                 matstruct->cuSpMV[opA].vecYDescr,
3061:                                 cusparse_scalartype,
3062:                                 cusparsestruct->spmvAlg,
3063:                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
3064:         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

3066:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3067:       } else {
3068:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3069:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
3070:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
3071:       }

3073:       stat = cusparseSpMV(cusparsestruct->handle, opA,
3074:                                matstruct->alpha_one,
3075:                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3076:                                matstruct->cuSpMV[opA].vecXDescr,
3077:                                beta,
3078:                                matstruct->cuSpMV[opA].vecYDescr,
3079:                                cusparse_scalartype,
3080:                                cusparsestruct->spmvAlg,
3081:                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
3082:      #else
3083:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3084:       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
3085:                                mat->num_rows, mat->num_cols,
3086:                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
3087:                                mat->values->data().get(), mat->row_offsets->data().get(),
3088:                                mat->column_indices->data().get(), xptr, beta,
3089:                                dptr);CHKERRCUSPARSE(stat);
3090:      #endif
3091:     } else {
3092:       if (cusparsestruct->nrows) {
3093:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3094:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3095:        #else
3096:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3097:         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
3098:                                  matstruct->alpha_one, matstruct->descr, hybMat,
3099:                                  xptr, beta,
3100:                                  dptr);CHKERRCUSPARSE(stat);
3101:        #endif
3102:       }
3103:     }
3104:     PetscLogGpuTimeEnd();

3106:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3107:       if (yy) { /* MatMultAdd: zz = A*xx + yy */
3108:         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3109:           VecCopy_SeqCUDA(yy,zz); /* zz = yy */
3110:         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3111:           VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
3112:         }
3113:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3114:         VecSet_SeqCUDA(zz,0);
3115:       }

3117:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3118:       if (compressed) {
3119:         PetscLogGpuTimeBegin();
3120:         /* I wanted to make this for_each asynchronous, but failed. thrust::async::for_each() returns an event (internally registered),
3121:            and in the destructor of its scope it will call cudaStreamSynchronize() on this stream. One would have to store all the events to
3122:            prevent that. So I just added a ScatterAdd kernel instead.
3123:          */
3124:        #if 0
3125:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3126:         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3127:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3128:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3129:                          VecCUDAPlusEquals());
3130:        #else
3131:         PetscInt n = matstruct->cprowIndices->size();
3132:         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
3133:        #endif
3134:         PetscLogGpuTimeEnd();
3135:       }
3136:     } else {
3137:       if (yy && yy != zz) {
3138:         VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
3139:       }
3140:     }
3141:     VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
3142:     if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
3143:     else {VecCUDARestoreArrayWrite(zz,&zarray);}
3144:   } catch(char *ex) {
3145:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3146:   }
3147:   if (yy) {
3148:     PetscLogGpuFlops(2.0*a->nz);
3149:   } else {
3150:     PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
3151:   }
3152:   return(0);
3153: }
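From user code the GPU path above is exercised simply by pairing the matrix with CUDA vectors; a minimal sketch (assuming A is an assembled n x n MATSEQAIJCUSPARSE matrix):

  Vec x,y;

  VecCreateSeqCUDA(PETSC_COMM_SELF,n,&x);
  VecDuplicate(x,&y);
  VecSet(x,1.0);
  MatMult(A,x,y);               /* dispatches to MatMult_SeqAIJCUSPARSE() above */
  MatMultTransposeAdd(A,x,y,y); /* y = A^T x + y, same kernel with trans = PETSC_TRUE */
  VecDestroy(&x);
  VecDestroy(&y);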

3155: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3156: {

3160:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
3161:   return(0);
3162: }

3164: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
3165: {
3166:   PetscErrorCode     ierr;
3167:   PetscObjectState   onnz = A->nonzerostate;
3168:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

3171:   MatAssemblyEnd_SeqAIJ(A,mode);
3172:   if (onnz != A->nonzerostate && cusp->deviceMat) {
3173:     cudaError_t cerr;

3175:     PetscInfo(A,"Destroy device mat since nonzerostate changed\n");
3176:     cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
3177:     cusp->deviceMat = NULL;
3178:   }
3179:   return(0);
3180: }

3182: /* --------------------------------------------------------------------------------*/
3183: /*@
3184:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3185:    (the default parallel PETSc format). This matrix will ultimately be pushed down
3186:    to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
3187:    assembly performance the user should preallocate the matrix storage by setting
3188:    the parameter nz (or the array nnz).  By setting these parameters accurately,
3189:    performance during matrix assembly can be increased by more than a factor of 50.

3191:    Collective

3193:    Input Parameters:
3194: +  comm - MPI communicator, set to PETSC_COMM_SELF
3195: .  m - number of rows
3196: .  n - number of columns
3197: .  nz - number of nonzeros per row (same for all rows)
3198: -  nnz - array containing the number of nonzeros in the various rows
3199:          (possibly different for each row) or NULL

3201:    Output Parameter:
3202: .  A - the matrix

3204:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3205:    MatXXXXSetPreallocation() paradigm instead of calling this routine directly.
3206:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
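
   For example, a minimal creation sketch following the recommended paradigm (error checking omitted):
.vb
   Mat A;
   MatCreate(PETSC_COMM_SELF,&A);
   MatSetSizes(A,m,n,m,n);
   MatSetType(A,MATSEQAIJCUSPARSE);
   MatSeqAIJSetPreallocation(A,nz,NULL);
.ve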

3208:    Notes:
3209:    If nnz is given then nz is ignored

3211:    The AIJ format (also called the Yale sparse matrix format or
3212:    compressed row storage), is fully compatible with standard Fortran 77
3213:    storage.  That is, the stored row and column indices can begin at
3214:    either one (as in Fortran) or zero.  See the users' manual for details.

3216:    Specify the preallocated storage with either nz or nnz (not both).
3217:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3218:    allocation.  For large problems you MUST preallocate memory or you
3219:    will get TERRIBLE performance, see the users' manual chapter on matrices.

3221:    By default, this format uses inodes (identical nodes) when possible, to
3222:    improve numerical efficiency of matrix-vector products and solves. We
3223:    search for consecutive rows with the same nonzero structure, thereby
3224:    reusing matrix information to achieve increased efficiency.

3226:    Level: intermediate

3228: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3229: @*/
3230: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3231: {

3235:   MatCreate(comm,A);
3236:   MatSetSizes(*A,m,n,m,n);
3237:   MatSetType(*A,MATSEQAIJCUSPARSE);
3238:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
3239:   return(0);
3240: }

3242: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3243: {

3247:   if (A->factortype == MAT_FACTOR_NONE) {
3248:     MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
3249:   } else {
3250:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
3251:   }
3252:   PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3253:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
3254:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3255:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3256:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3257:   PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
3258:   PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3259:   PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3260:   PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);
3261:   MatDestroy_SeqAIJ(A);
3262:   return(0);
3263: }

3265: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3266: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
3267: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
3268: {

3272:   MatDuplicate_SeqAIJ(A,cpvalues,B);
3273:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
3274:   return(0);
3275: }

3277: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
3278: {
3279:   PetscErrorCode     ierr;
3280:   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3281:   Mat_SeqAIJCUSPARSE *cy;
3282:   Mat_SeqAIJCUSPARSE *cx;
3283:   PetscScalar        *ay;
3284:   const PetscScalar  *ax;
3285:   CsrMatrix          *csry,*csrx;

3288:   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3289:   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3290:   if (X->ops->axpy != Y->ops->axpy) {
3291:     MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3292:     MatAXPY_SeqAIJ(Y,a,X,str);
3293:     return(0);
3294:   }
3295:   /* if we are here, it means both matrices are bound to GPU */
3296:   MatSeqAIJCUSPARSECopyToGPU(Y);
3297:   MatSeqAIJCUSPARSECopyToGPU(X);
3298:   if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3299:   if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3300:   csry = (CsrMatrix*)cy->mat->mat;
3301:   csrx = (CsrMatrix*)cx->mat->mat;
3302:   /* see if we can turn this into a cublas axpy */
3303:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3304:     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3305:     if (eq) {
3306:       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3307:     }
3308:     if (eq) str = SAME_NONZERO_PATTERN;
3309:   }
3310:   /* spgeam is buggy with one column */
3311:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

3313:   if (str == SUBSET_NONZERO_PATTERN) {
3314:     cusparseStatus_t stat;
3315:     PetscScalar      b = 1.0;
3316: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3317:     size_t           bufferSize;
3318:     void             *buffer;
3319:     cudaError_t      cerr;
3320: #endif

3322:     MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3323:     MatSeqAIJCUSPARSEGetArray(Y,&ay);
3324:     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3325: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3326:     stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3327:                                           &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3328:                                           &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3329:                                              cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3330:     cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3331:     PetscLogGpuTimeBegin();
3332:     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3333:                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3334:                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3335:                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3336:     PetscLogGpuFlops(x->nz + y->nz);
3337:     PetscLogGpuTimeEnd();
3338:     cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3339: #else
3340:     PetscLogGpuTimeBegin();
3341:     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3342:                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3343:                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3344:                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3345:     PetscLogGpuFlops(x->nz + y->nz);
3346:     PetscLogGpuTimeEnd();
3347: #endif
3348:     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3349:     MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3350:     MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3351:     MatSeqAIJInvalidateDiagonal(Y);
3352:   } else if (str == SAME_NONZERO_PATTERN) {
3353:     cublasHandle_t cublasv2handle;
3354:     cublasStatus_t berr;
3355:     PetscBLASInt   one = 1, bnz = 1;

3357:     MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3358:     MatSeqAIJCUSPARSEGetArray(Y,&ay);
3359:     PetscCUBLASGetHandle(&cublasv2handle);
3360:     PetscBLASIntCast(x->nz,&bnz);
3361:     PetscLogGpuTimeBegin();
3362:     berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3363:     PetscLogGpuFlops(2.0*bnz);
3364:     PetscLogGpuTimeEnd();
3365:     MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3366:     MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3367:     MatSeqAIJInvalidateDiagonal(Y);
3368:   } else {
3369:     MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3370:     MatAXPY_SeqAIJ(Y,a,X,str);
3371:   }
3372:   return(0);
3373: }

3375: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3376: {
3378:   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
3379:   PetscScalar    *ay;
3380:   cublasHandle_t cublasv2handle;
3381:   cublasStatus_t berr;
3382:   PetscBLASInt   one = 1, bnz = 1;

3385:   MatSeqAIJCUSPARSEGetArray(Y,&ay);
3386:   PetscCUBLASGetHandle(&cublasv2handle);
3387:   PetscBLASIntCast(y->nz,&bnz);
3388:   PetscLogGpuTimeBegin();
3389:   berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
3390:   PetscLogGpuFlops(bnz);
3391:   PetscLogGpuTimeEnd();
3392:   MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3393:   MatSeqAIJInvalidateDiagonal(Y);
3394:   return(0);
3395: }

3397: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3398: {
3400:   PetscBool      both = PETSC_FALSE;
3401:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

3404:   if (A->factortype == MAT_FACTOR_NONE) {
3405:     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
3406:     if (spptr->mat) {
3407:       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
3408:       if (matrix->values) {
3409:         both = PETSC_TRUE;
3410:         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3411:       }
3412:     }
3413:     if (spptr->matTranspose) {
3414:       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
3415:       if (matrix->values) {
3416:         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3417:       }
3418:     }
3419:   }
3420:   /* zero the host values directly (inlining MatZeroEntries_SeqAIJ()) so we can set the offload mask ourselves below */
3421:   PetscArrayzero(a->a,a->i[A->rmap->n]);
3422:   MatSeqAIJInvalidateDiagonal(A);
3423:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3424:   else A->offloadmask = PETSC_OFFLOAD_CPU;
3425:   return(0);
3426: }

3428: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3429: {
3430:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

3434:   if (A->factortype != MAT_FACTOR_NONE) return(0);
3435:   if (flg) {
3436:     MatSeqAIJCUSPARSECopyFromGPU(A);

3438:     A->ops->scale                     = MatScale_SeqAIJ;
3439:     A->ops->axpy                      = MatAXPY_SeqAIJ;
3440:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3441:     A->ops->mult                      = MatMult_SeqAIJ;
3442:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3443:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3444:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3445:     A->ops->multhermitiantranspose    = NULL;
3446:     A->ops->multhermitiantransposeadd = NULL;
3447:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3448:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3449:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3450:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3451:     PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3452:     PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3453:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);
3454:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3455:   } else {
3456:     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3457:     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3458:     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3459:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3460:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3461:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3462:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3463:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3464:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3465:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3466:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);
3467:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3468:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3469:     PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);
3470:     PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);
3471:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);
3472:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3473:   }
3474:   A->boundtocpu = flg;
3475:   a->inode.use = flg;
3476:   return(0);
3477: }

3479: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3480: {
3481:   PetscErrorCode   ierr;
3482:   cusparseStatus_t stat;
3483:   Mat              B;

3486:   PetscCUDAInitializeCheck(); /* first use of CUSPARSE may be via MatConvert */
3487:   if (reuse == MAT_INITIAL_MATRIX) {
3488:     MatDuplicate(A,MAT_COPY_VALUES,newmat);
3489:   } else if (reuse == MAT_REUSE_MATRIX) {
3490:     MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
3491:   }
3492:   B = *newmat;

3494:   PetscFree(B->defaultvectype);
3495:   PetscStrallocpy(VECCUDA,&B->defaultvectype);

3497:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3498:     if (B->factortype == MAT_FACTOR_NONE) {
3499:       Mat_SeqAIJCUSPARSE *spptr;
3500:       PetscNew(&spptr);
3501:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3502:       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3503:       spptr->format     = MAT_CUSPARSE_CSR;
3504:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3505:      #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3506:       spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3507:      #else
3508:       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
3509:      #endif
3510:       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3511:       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3512:      #endif
3513:       B->spptr = spptr;
3514:     } else {
3515:       Mat_SeqAIJCUSPARSETriFactors *spptr;

3517:       PetscNew(&spptr);
3518:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3519:       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3520:       B->spptr = spptr;
3521:     }
3522:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3523:   }
3524:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
3525:   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
3526:   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
3527:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3528:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3529:   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

3531:   MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
3532:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
3533:   PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
3534: #if defined(PETSC_HAVE_HYPRE)
3535:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);
3536: #endif
3537:   return(0);
3538: }
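
/* A minimal conversion sketch (assuming an assembled MATSEQAIJ matrix A; error checking omitted):

     Mat B;
     MatConvert(A,MATSEQAIJCUSPARSE,MAT_INITIAL_MATRIX,&B);  // B is a GPU-enabled copy of A
     MatConvert(A,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);  // or convert A itself in place
*/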

3540: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3541: {

3545:   MatCreate_SeqAIJ(B);
3546:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
3547:   return(0);
3548: }

3550: /*MC
3551:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

3553:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
3554:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3555:    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

3557:    Options Database Keys:
3558: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3559: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3560: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
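
   For example, one might run an application as follows (hypothetical executable name):
.vb
   ./app -mat_type aijcusparse -mat_cusparse_storage_format ell
.ve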

3562:   Level: beginner

3564: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3565: M*/

3567: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

3569: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3570: {

3574:   MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);
3575:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
3576:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
3577:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
3578:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);

3580:   return(0);
3581: }
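
/* With these registrations, the cusparse factorizations can be selected at run time, e.g. (a sketch,
   assuming a hypothetical executable):

     ./app -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
*/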

3583: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3584: {
3585:   PetscErrorCode   ierr;
3586:   cusparseStatus_t stat;

3589:   if (*cusparsestruct) {
3590:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
3591:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
3592:     delete (*cusparsestruct)->workVector;
3593:     delete (*cusparsestruct)->rowoffsets_gpu;
3594:     delete (*cusparsestruct)->cooPerm;
3595:     delete (*cusparsestruct)->cooPerm_a;
3596:     delete (*cusparsestruct)->csr2csc_i;
3597:     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3598:     PetscFree(*cusparsestruct);
3599:   }
3600:   return(0);
3601: }

3603: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3604: {
3606:   if (*mat) {
3607:     delete (*mat)->values;
3608:     delete (*mat)->column_indices;
3609:     delete (*mat)->row_offsets;
3610:     delete *mat;
3611:     *mat = 0;
3612:   }
3613:   return(0);
3614: }

3616: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3617: {
3618:   cusparseStatus_t stat;
3619:   PetscErrorCode   ierr;

3622:   if (*trifactor) {
3623:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3624:     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
3625:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
3626:     if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
3627:     if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3628:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3629:     if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3630:    #endif
3631:     PetscFree(*trifactor);
3632:   }
3633:   return(0);
3634: }

3636: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
3637: {
3638:   CsrMatrix        *mat;
3639:   cusparseStatus_t stat;
3640:   cudaError_t      err;

3643:   if (*matstruct) {
3644:     if ((*matstruct)->mat) {
3645:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3646:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3647:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3648:        #else
3649:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3650:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3651:        #endif
3652:       } else {
3653:         mat = (CsrMatrix*)(*matstruct)->mat;
3654:         CsrMatrix_Destroy(&mat);
3655:       }
3656:     }
3657:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
3658:     delete (*matstruct)->cprowIndices;
3659:     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
3660:     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
3661:     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

3663:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3664:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3665:     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3666:     for (int i=0; i<3; i++) {
3667:       if (mdata->cuSpMV[i].initialized) {
3668:         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3669:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3670:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3671:       }
3672:     }
3673:    #endif
3674:     delete *matstruct;
3675:     *matstruct = NULL;
3676:   }
3677:   return(0);
3678: }

3680: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
3681: {

3685:   if (*trifactors) {
3686:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
3687:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
3688:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
3689:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
3690:     delete (*trifactors)->rpermIndices;
3691:     delete (*trifactors)->cpermIndices;
3692:     delete (*trifactors)->workVector;
3693:     (*trifactors)->rpermIndices = NULL;
3694:     (*trifactors)->cpermIndices = NULL;
3695:     (*trifactors)->workVector = NULL;
3696:     if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
3697:     if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
3698:     (*trifactors)->init_dev_prop = PETSC_FALSE;
3699:   }
3700:   return(0);
3701: }

3703: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3704: {
3705:   PetscErrorCode   ierr;
3706:   cusparseHandle_t handle;
3707:   cusparseStatus_t stat;

3710:   if (*trifactors) {
3711:     MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
3712:     if ((handle = (*trifactors)->handle)) { /* note: intentional assignment in the condition */
3713:       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
3714:     }
3715:     PetscFree(*trifactors);
3716:   }
3717:   return(0);
3718: }
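
/* Functors used by the COO assembly routines below: IJCompare orders (i,j) pairs lexicographically,
   IJEqual detects duplicate pairs, and IJDiff/IJSum combine to flag the positions where a new unique
   nonzero starts */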

3720: struct IJCompare
3721: {
3722:   __host__ __device__
3723:   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3724:   {
3725:     if (t1.get<0>() < t2.get<0>()) return true;
3726:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3727:     return false;
3728:   }
3729: };

3731: struct IJEqual
3732: {
3733:   __host__ __device__
3734:   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3735:   {
3736:     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3737:     return true;
3738:   }
3739: };

3741: struct IJDiff
3742: {
3743:   __host__ __device__
3744:   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3745:   {
3746:     return t1 == t2 ? 0 : 1;
3747:   }
3748: };

3750: struct IJSum
3751: {
3752:   __host__ __device__
3753:   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3754:   {
3755:     return t1||t2;
3756:   }
3757: };

3759: #include <thrust/iterator/discard_iterator.h>
3760: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3761: {
3762:   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3763:   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
3764:   THRUSTARRAY                           *cooPerm_v = NULL;
3765:   thrust::device_ptr<const PetscScalar> d_v;
3766:   CsrMatrix                             *matrix;
3767:   PetscErrorCode                        ierr;
3768:   PetscInt                              n;

3771:   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
3772:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
3773:   if (!cusp->cooPerm) {
3774:     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
3775:     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
3776:     return(0);
3777:   }
3778:   matrix = (CsrMatrix*)cusp->mat->mat;
3779:   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3780:   if (!v) {
3781:     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3782:     goto finalize;
3783:   }
3784:   n = cusp->cooPerm->size();
3785:   if (isCudaMem(v)) {
3786:     d_v = thrust::device_pointer_cast(v);
3787:   } else {
3788:     cooPerm_v = new THRUSTARRAY(n);
3789:     cooPerm_v->assign(v,v+n);
3790:     d_v = cooPerm_v->data();
3791:     PetscLogCpuToGpu(n*sizeof(PetscScalar));
3792:   }
3793:   PetscLogGpuTimeBegin();
3794:   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3795:     if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them up first */
3796:       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3797:       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3798:       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
3799:         cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of entries in d_v[].
3800:         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
3801:       */
3802:       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3803:       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3804:       delete cooPerm_w;
3805:     } else {
3806:       /* all nonzeros in d_v[] are unique entries */
3807:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3808:                                                                 matrix->values->begin()));
3809:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3810:                                                                 matrix->values->end()));
3811:       thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
3812:     }
3813:   } else {
3814:     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3815:       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3816:       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3817:     } else {
3818:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3819:                                                                 matrix->values->begin()));
3820:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3821:                                                                 matrix->values->end()));
3822:       thrust::for_each(zibit,zieit,VecCUDAEquals());
3823:     }
3824:   }
3825:   PetscLogGpuTimeEnd();
3826: finalize:
3827:   delete cooPerm_v;
3828:   A->offloadmask = PETSC_OFFLOAD_GPU;
3829:   PetscObjectStateIncrease((PetscObject)A);
3830:   /* shorter version of MatAssemblyEnd_SeqAIJ */
3831:   PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);
3832:   PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");
3833:   PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);
3834:   a->reallocs         = 0;
3835:   A->info.mallocs    += 0;
3836:   A->info.nz_unneeded = 0;
3837:   A->assembled = A->was_assembled = PETSC_TRUE;
3838:   A->num_ass++;
3839:   return(0);
3840: }
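
/* A minimal COO assembly sketch (assuming host arrays coo_i[], coo_j[] and values v[] of length n;
   error checking omitted):

     MatSetPreallocationCOO(A,n,coo_i,coo_j);  // set the nonzero pattern once
     MatSetValuesCOO(A,v,INSERT_VALUES);       // fill (or later refill) the values
*/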

3842: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3843: {
3844:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3845:   PetscErrorCode     ierr;

3849:   if (!cusp) return(0);
3850:   if (destroy) {
3851:     MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
3852:     delete cusp->csr2csc_i;
3853:     cusp->csr2csc_i = NULL;
3854:   }
3855:   A->transupdated = PETSC_FALSE;
3856:   return(0);
3857: }

3859: #include <thrust/binary_search.h>
3860: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
3861: {
3862:   PetscErrorCode     ierr;
3863:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3864:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3865:   PetscInt           cooPerm_n, nzr = 0;
3866:   cudaError_t        cerr;

3869:   PetscLayoutSetUp(A->rmap);
3870:   PetscLayoutSetUp(A->cmap);
3871:   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3872:   if (n != cooPerm_n) {
3873:     delete cusp->cooPerm;
3874:     delete cusp->cooPerm_a;
3875:     cusp->cooPerm = NULL;
3876:     cusp->cooPerm_a = NULL;
3877:   }
3878:   if (n) {
3879:     THRUSTINTARRAY d_i(n);
3880:     THRUSTINTARRAY d_j(n);
3881:     THRUSTINTARRAY ii(A->rmap->n);

3883:     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
3884:     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

3886:     PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
3887:     d_i.assign(coo_i,coo_i+n);
3888:     d_j.assign(coo_j,coo_j+n);

3890:     /* Ex.
3891:       n = 6
3892:       coo_i = [3,3,1,4,1,4]
3893:       coo_j = [3,2,2,5,2,6]
3894:     */
3895:     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3896:     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

3898:     PetscLogGpuTimeBegin();
3899:     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3900:     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
3901:     *cusp->cooPerm_a = d_i; /* copy the sorted array */
3902:     THRUSTINTARRAY w = d_j;

3904:     /*
3905:       d_i     = [1,1,3,3,4,4]
3906:       d_j     = [2,2,2,3,5,6]
3907:       cooPerm = [2,4,1,0,3,5]
3908:     */
3909:     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

3911:     /*
3912:       d_i     = [1,3,3,4,4,x]
3913:                             ^ekey
3914:       d_j     = [2,2,3,5,6,x]
3915:                            ^nekey
3916:     */
3917:     if (nekey == ekey) { /* all entries are unique */
3918:       delete cusp->cooPerm_a;
3919:       cusp->cooPerm_a = NULL;
3920:     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
3921:       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
3922:       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
3923:       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
3924:       (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
3925:       w[0] = 0;
3926:       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
3927:       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
3928:     }
3929:     thrust::counting_iterator<PetscInt> search_begin(0);
3930:     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
3931:                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
3932:                         ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
3933:     PetscLogGpuTimeEnd();

3935:     MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);
3936:     a->singlemalloc = PETSC_FALSE;
3937:     a->free_a       = PETSC_TRUE;
3938:     a->free_ij      = PETSC_TRUE;
3939:     PetscMalloc1(A->rmap->n+1,&a->i);
3940:     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
3941:     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3942:     a->nz = a->maxnz = a->i[A->rmap->n];
3943:     a->rmax = 0;
3944:     PetscMalloc1(a->nz,&a->a);
3945:     PetscMalloc1(a->nz,&a->j);
3946:     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3947:     if (!a->ilen) { PetscMalloc1(A->rmap->n,&a->ilen); }
3948:     if (!a->imax) { PetscMalloc1(A->rmap->n,&a->imax); }
3949:     for (PetscInt i = 0; i < A->rmap->n; i++) {
3950:       const PetscInt nnzr = a->i[i+1] - a->i[i];
3951:       nzr += (PetscInt)!!(nnzr);
3952:       a->ilen[i] = a->imax[i] = nnzr;
3953:       a->rmax = PetscMax(a->rmax,nnzr);
3954:     }
3955:     a->nonzerorowcnt = nzr;
3956:     A->preallocated = PETSC_TRUE;
3957:     PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));
3958:     MatMarkDiagonal_SeqAIJ(A);
3959:   } else {
3960:     MatSeqAIJSetPreallocation(A,0,NULL);
3961:   }
3962:   MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);

3964:   /* We want to allocate the CUSPARSE struct for matvec now.
3965:      The code is so convoluted now that I prefer to copy zeros */
3966:   PetscArrayzero(a->a,a->nz);
3967:   MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);
3968:   A->offloadmask = PETSC_OFFLOAD_CPU;
3969:   A->nonzerostate++;
3970:   MatSeqAIJCUSPARSECopyToGPU(A);
3971:   MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);

3973:   A->assembled = PETSC_FALSE;
3974:   A->was_assembled = PETSC_FALSE;
3975:   return(0);
3976: }

3978: /*@C
3979:     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

3981:    Not collective

3983:     Input Parameters:
3984: +   A - the matrix
3985: -   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form

3987:     Output Parameters:
3988: +   ia - the CSR row pointers
3989: -   ja - the CSR column indices

3991:     Level: developer

3993:     Notes:
3994:       When compressed is true, the CSR structure does not contain empty rows
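
      A typical access pattern (a sketch; error checking omitted):
.vb
      const int *ia,*ja;
      MatSeqAIJCUSPARSEGetIJ(A,PETSC_TRUE,&ia,&ja);
      /* use the device pointers ia and ja, e.g. in a kernel */
      MatSeqAIJCUSPARSERestoreIJ(A,PETSC_TRUE,&ia,&ja);
.ve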

3996: .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
3997: @*/
3998: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
3999: {
4000:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4001:   CsrMatrix          *csr;
4002:   PetscErrorCode     ierr;
4003:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

4007:   if (!i || !j) return(0);
4009:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4010:   MatSeqAIJCUSPARSECopyToGPU(A);
4011:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4012:   csr = (CsrMatrix*)cusp->mat->mat;
4013:   if (i) {
4014:     if (!compressed && a->compressedrow.use) { /* need full row offset */
4015:       if (!cusp->rowoffsets_gpu) {
4016:         cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4017:         cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4018:         PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
4019:       }
4020:       *i = cusp->rowoffsets_gpu->data().get();
4021:     } else *i = csr->row_offsets->data().get();
4022:   }
4023:   if (j) *j = csr->column_indices->data().get();
4024:   return(0);
4025: }

4027: /*@C
4028:     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

4030:    Not collective

4032:     Input Parameters:
4033: +   A - the matrix
4034: -   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form

4036:     Output Parameters:
4037: +   ia - the CSR row pointers
4038: -   ja - the CSR column indices

4040:     Level: developer

4042: .seealso: MatSeqAIJCUSPARSEGetIJ()
4043: @*/
4044: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
4045: {
4049:   if (i) *i = NULL;
4050:   if (j) *j = NULL;
4051:   return(0);
4052: }

4054: /*@C
4055:    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

4057:    Not Collective

4059:    Input Parameter:
4060: .   A - a MATSEQAIJCUSPARSE matrix

4062:    Output Parameter:
4063: .   a - pointer to the device data

4065:    Level: developer

4067:    Notes: may trigger host-device copies if the up-to-date matrix data is on the host
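
   A typical access pattern (a sketch; error checking omitted):
.vb
   const PetscScalar *aa;
   MatSeqAIJCUSPARSEGetArrayRead(A,&aa);
   /* read-only use of the device pointer aa */
   MatSeqAIJCUSPARSERestoreArrayRead(A,&aa);
.ve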

4069: .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4070: @*/
4071: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
4072: {
4073:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4074:   CsrMatrix          *csr;
4075:   PetscErrorCode     ierr;

4081:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4082:   MatSeqAIJCUSPARSECopyToGPU(A);
4083:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4084:   csr = (CsrMatrix*)cusp->mat->mat;
4085:   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4086:   *a = csr->values->data().get();
4087:   return(0);
4088: }

4090: /*@C
4091:    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

4093:    Not Collective

4095:    Input Parameter:
4096: .   A - a MATSEQAIJCUSPARSE matrix

4098:    Output Parameter:
4099: .   a - pointer to the device data

4101:    Level: developer

4103: .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4104: @*/
4105: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
4106: {
4111:   *a = NULL;
4112:   return(0);
4113: }

4115: /*@C
4116:    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

4118:    Not Collective

4120:    Input Parameter:
4121: .   A - a MATSEQAIJCUSPARSE matrix

4123:    Output Parameter:
4124: .   a - pointer to the device data

4126:    Level: developer

4128:    Notes: may trigger host-device copies if the up-to-date matrix data is on the host

4130: .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4131: @*/
4132: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
4133: {
4134:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4135:   CsrMatrix          *csr;
4136:   PetscErrorCode     ierr;

4142:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4143:   MatSeqAIJCUSPARSECopyToGPU(A);
4144:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4145:   csr = (CsrMatrix*)cusp->mat->mat;
4146:   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4147:   *a = csr->values->data().get();
4148:   A->offloadmask = PETSC_OFFLOAD_GPU;
4149:   MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
4150:   return(0);
4151: }
4152: /*@C
4153:    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

4155:    Not Collective

4157:    Input Parameter:
4158: .   A - a MATSEQAIJCUSPARSE matrix

4160:    Output Parameter:
4161: .   a - pointer to the device data

4163:    Level: developer

4165: .seealso: MatSeqAIJCUSPARSEGetArray()
4166: @*/
4167: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
4168: {

4175:   PetscObjectStateIncrease((PetscObject)A);
4176:   *a = NULL;
4177:   return(0);
4178: }

4180: /*@C
4181:    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

4183:    Not Collective

4185:    Input Parameter:
4186: .   A - a MATSEQAIJCUSPARSE matrix

4188:    Output Parameter:
4189: .   a - pointer to the device data

4191:    Level: developer

4193:    Notes: does not trigger host-device copies and flags data validity on the GPU

4195: .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
4196: @*/
4197: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
4198: {
4199:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4200:   CsrMatrix          *csr;
4201:   PetscErrorCode     ierr;

4207:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4208:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4209:   csr = (CsrMatrix*)cusp->mat->mat;
4210:   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4211:   *a = csr->values->data().get();
4212:   A->offloadmask = PETSC_OFFLOAD_GPU;
4213:   MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
4214:   return(0);
4215: }

4217: /*@C
4218:    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

4220:    Not Collective

4222:    Input Parameter:
4223: .   A - a MATSEQAIJCUSPARSE matrix

4225:    Output Parameter:
4226: .   a - pointer to the device data

4228:    Level: developer

4230: .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
4231: @*/
4232: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
4233: {

4240:   PetscObjectStateIncrease((PetscObject)A);
4241:   *a = NULL;
4242:   return(0);
4243: }

4245: struct IJCompare4
4246: {
4247:   __host__ __device__
4248:   inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4249:   {
4250:     if (t1.get<0>() < t2.get<0>()) return true;
4251:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4252:     return false;
4253:   }
4254: };

4256: struct Shift
4257: {
4258:   int _shift;

4260:   Shift(int shift) : _shift(shift) {}
4261:   __host__ __device__
4262:   inline int operator() (const int &c)
4263:   {
4264:     return c + _shift;
4265:   }
4266: };

4268: /* merges two SeqAIJCUSPARSE matrices A and B by concatenating their rows side by side, i.e. C = [A B] ([A';B']' in MATLAB notation) */
4269: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4270: {
4271:   PetscErrorCode               ierr;
4272:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4273:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4274:   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4275:   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4276:   PetscInt                     Annz,Bnnz;
4277:   cusparseStatus_t             stat;
4278:   PetscInt                     i,m,n,zero = 0;
4279:   cudaError_t                  cerr;

4287:   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
4288:   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
4289:   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4290:   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4291:   if (reuse == MAT_INITIAL_MATRIX) {
4292:     m     = A->rmap->n;
4293:     n     = A->cmap->n + B->cmap->n;
4294:     MatCreate(PETSC_COMM_SELF,C);
4295:     MatSetSizes(*C,m,n,m,n);
4296:     MatSetType(*C,MATSEQAIJCUSPARSE);
4297:     c     = (Mat_SeqAIJ*)(*C)->data;
4298:     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4299:     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4300:     Ccsr  = new CsrMatrix;
4301:     Cmat->cprowIndices      = NULL;
4302:     c->compressedrow.use    = PETSC_FALSE;
4303:     c->compressedrow.nrows  = 0;
4304:     c->compressedrow.i      = NULL;
4305:     c->compressedrow.rindex = NULL;
4306:     Ccusp->workVector       = NULL;
4307:     Ccusp->nrows    = m;
4308:     Ccusp->mat      = Cmat;
4309:     Ccusp->mat->mat = Ccsr;
4310:     Ccsr->num_rows  = m;
4311:     Ccsr->num_cols  = n;
4312:     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4313:     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4314:     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4315:     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4316:     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4317:     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4318:     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4319:     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4320:     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4321:     MatSeqAIJCUSPARSECopyToGPU(A);
4322:     MatSeqAIJCUSPARSECopyToGPU(B);
4323:     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4324:     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

4326:     Acsr = (CsrMatrix*)Acusp->mat->mat;
4327:     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4328:     Annz = (PetscInt)Acsr->column_indices->size();
4329:     Bnnz = (PetscInt)Bcsr->column_indices->size();
4330:     c->nz = Annz + Bnnz;
4331:     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4332:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4333:     Ccsr->values = new THRUSTARRAY(c->nz);
4334:     Ccsr->num_entries = c->nz;
4335:     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4336:     if (c->nz) {
4337:       auto Acoo = new THRUSTINTARRAY32(Annz);
4338:       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4339:       auto Ccoo = new THRUSTINTARRAY32(c->nz);
4340:       THRUSTINTARRAY32 *Aroff,*Broff;

4342:       if (a->compressedrow.use) { /* need full row offset */
4343:         if (!Acusp->rowoffsets_gpu) {
4344:           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4345:           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4346:           PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
4347:         }
4348:         Aroff = Acusp->rowoffsets_gpu;
4349:       } else Aroff = Acsr->row_offsets;
4350:       if (b->compressedrow.use) { /* need full row offset */
4351:         if (!Bcusp->rowoffsets_gpu) {
4352:           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4353:           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4354:           PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
4355:         }
4356:         Broff = Bcusp->rowoffsets_gpu;
4357:       } else Broff = Bcsr->row_offsets;
4358:       PetscLogGpuTimeBegin();
4359:       stat = cusparseXcsr2coo(Acusp->handle,
4360:                               Aroff->data().get(),
4361:                               Annz,
4362:                               m,
4363:                               Acoo->data().get(),
4364:                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4365:       stat = cusparseXcsr2coo(Bcusp->handle,
4366:                               Broff->data().get(),
4367:                               Bnnz,
4368:                               m,
4369:                               Bcoo->data().get(),
4370:                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4371:       /* issues when using bool with large matrices on SUMMIT with CUDA 10.2.89 */
4372:       auto Aperm = thrust::make_constant_iterator(1);
4373:       auto Bperm = thrust::make_constant_iterator(0);
4374: #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4375:       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4376:       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4377: #else
4378:       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4379:       auto Bcib = Bcsr->column_indices->begin();
4380:       auto Bcie = Bcsr->column_indices->end();
4381:       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4382: #endif
4383:       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4384:       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4385:       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4386:       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4387:       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4388:       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4389:       auto p1 = Ccusp->cooPerm->begin();
4390:       auto p2 = Ccusp->cooPerm->begin();
4391:       thrust::advance(p2,Annz);
4392:       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4393: #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4394:       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4395: #endif
4396:       auto cci = thrust::make_counting_iterator(zero);
4397:       auto cce = thrust::make_counting_iterator(c->nz);
4398: #if 0 /* errors on SUMMIT with CUDA 11.1.0 */
4399:       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4400: #else
4401:       auto pred = thrust::identity<int>();
4402:       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4403:       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4404: #endif
4405:       stat = cusparseXcoo2csr(Ccusp->handle,
4406:                               Ccoo->data().get(),
4407:                               c->nz,
4408:                               m,
4409:                               Ccsr->row_offsets->data().get(),
4410:                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4411:       PetscLogGpuTimeEnd();
4412:       delete wPerm;
4413:       delete Acoo;
4414:       delete Bcoo;
4415:       delete Ccoo;
4416: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4417:       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4418:                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4419:                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4420:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4421: #endif
4422:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have explicit transposes, generate the transpose of C too */
4423:         MatSeqAIJCUSPARSEFormExplicitTranspose(A);
4424:         MatSeqAIJCUSPARSEFormExplicitTranspose(B);
4425:         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4426:         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4427:         CsrMatrix *CcsrT = new CsrMatrix;
4428:         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4429:         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

4431:         (*C)->form_explicit_transpose = PETSC_TRUE;
4432:         (*C)->transupdated = PETSC_TRUE;
4433:         Ccusp->rowoffsets_gpu = NULL;
4434:         CmatT->cprowIndices = NULL;
4435:         CmatT->mat = CcsrT;
4436:         CcsrT->num_rows = n;
4437:         CcsrT->num_cols = m;
4438:         CcsrT->num_entries = c->nz;

4440:         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4441:         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4442:         CcsrT->values = new THRUSTARRAY(c->nz);

4444:         PetscLogGpuTimeBegin();
4445:         auto rT = CcsrT->row_offsets->begin();
4446:         if (AT) {
4447:           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4448:           thrust::advance(rT,-1);
4449:         }
4450:         if (BT) {
4451:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4452:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4453:           thrust::copy(titb,tite,rT);
4454:         }
4455:         auto cT = CcsrT->column_indices->begin();
4456:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4457:         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4458:         auto vT = CcsrT->values->begin();
4459:         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4460:         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4461:         PetscLogGpuTimeEnd();

4463:         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4464:         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4465:         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4466:         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4467:         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4468:         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4469:         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4470:         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4471:         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
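              /* the cuSPARSE handle is configured with CUSPARSE_POINTER_MODE_DEVICE
                 during setup earlier in this file, so the 1/0 scalars passed as
                 alpha/beta must live in device memory; upload them once here */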
4472: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4473:         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4474:                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4475:                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4476:                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4477: #endif
4478:         Ccusp->matTranspose = CmatT;
4479:       }
4480:     }

4482:     c->singlemalloc = PETSC_FALSE;
4483:     c->free_a       = PETSC_TRUE;
4484:     c->free_ij      = PETSC_TRUE;
4485:     PetscMalloc1(m+1,&c->i);
4486:     PetscMalloc1(c->nz,&c->j);
4487:     if (PetscDefined(USE_64BIT_INDICES)) { /* convert the 32-bit cuSPARSE indices to 64-bit on the GPU, then copy to host (lazy but simple) */
4488:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4489:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4490:       ii   = *Ccsr->row_offsets;
4491:       jj   = *Ccsr->column_indices;
4492:       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4493:       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4494:     } else {
4495:       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4496:       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4497:     }
4498:     PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
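          /* rebuild the host-side SeqAIJ bookkeeping (per-row lengths, max row length,
             nonzero-row count) from the row offsets just copied into c->i */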
4499:     PetscMalloc1(m,&c->ilen);
4500:     PetscMalloc1(m,&c->imax);
4501:     c->maxnz = c->nz;
4502:     c->nonzerorowcnt = 0;
4503:     c->rmax = 0;
4504:     for (i = 0; i < m; i++) {
4505:       const PetscInt nn = c->i[i+1] - c->i[i];
4506:       c->ilen[i] = c->imax[i] = nn;
4507:       c->nonzerorowcnt += (PetscInt)!!nn;
4508:       c->rmax = PetscMax(c->rmax,nn);
4509:     }
4510:     MatMarkDiagonal_SeqAIJ(*C);
4511:     PetscMalloc1(c->nz,&c->a);
4512:     (*C)->nonzerostate++;
4513:     PetscLayoutSetUp((*C)->rmap);
4514:     PetscLayoutSetUp((*C)->cmap);
4515:     Ccusp->nonzerostate = (*C)->nonzerostate;
4516:     (*C)->preallocated  = PETSC_TRUE;
4517:   } else {
4518:     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
4519:     c = (Mat_SeqAIJ*)(*C)->data;
4520:     if (c->nz) {
4521:       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4522:       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4523:       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4524:       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4525:       MatSeqAIJCUSPARSECopyToGPU(A);
4526:       MatSeqAIJCUSPARSECopyToGPU(B);
4527:       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4528:       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4529:       Acsr = (CsrMatrix*)Acusp->mat->mat;
4530:       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4531:       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4532:       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4533:       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4534:       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4535:       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4536:       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4537:       auto pmid = Ccusp->cooPerm->begin();
4538:       thrust::advance(pmid,Acsr->num_entries);
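            /* cooPerm maps position k in the concatenated [A values; B values] to its
               slot in C's value array; pmid splits it into the A part (the first
               Acsr->num_entries entries) and the B part, so the two scatters below
               refresh C's values in place */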
4539:       PetscLogGpuTimeBegin();
4540:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4541:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4542:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4543:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4544:       thrust::for_each(zibait,zieait,VecCUDAEquals());
4545:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4546:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4547:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4548:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4549:       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4550:       MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);
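            /* new values make any cached transpose stale; when A, B and C all keep
               explicit transposes we can refresh C^T's values cheaply by concatenating
               the transposed values instead of recomputing the transpose */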
4551:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4552:         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4553:         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4554:         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4555:         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4556:         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4557:         auto vT = CcsrT->values->begin();
4558:         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4559:         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4560:         (*C)->transupdated = PETSC_TRUE;
4561:       }
4562:       PetscLogGpuTimeEnd();
4563:     }
4564:   }
4565:   PetscObjectStateIncrease((PetscObject)*C);
4566:   (*C)->assembled     = PETSC_TRUE;
4567:   (*C)->was_assembled = PETSC_FALSE;
4568:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4569:   return(0);
4570: }
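
/* The routine below gathers v[k] = a[idx[k]] from the matrix value array (or the
   first n values when idx is NULL); v may point to host or device memory and the
   gather itself is performed on the GPU */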

4572: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4573: {
4574:   PetscErrorCode    ierr;
4575:   bool              dmem;
4576:   const PetscScalar *av;
4577:   cudaError_t       cerr;

4580:   dmem = isCudaMem(v);
4581:   MatSeqAIJCUSPARSEGetArrayRead(A,&av);
4582:   if (n && idx) {
4583:     THRUSTINTARRAY widx(n);
4584:     widx.assign(idx,idx+n);
4585:     PetscLogCpuToGpu(n*sizeof(PetscInt));

4587:     THRUSTARRAY *w = NULL;
4588:     thrust::device_ptr<PetscScalar> dv;
4589:     if (dmem) {
4590:       dv = thrust::device_pointer_cast(v);
4591:     } else {
4592:       w = new THRUSTARRAY(n);
4593:       dv = w->data();
4594:     }
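          /* when v is host memory, gather into the device scratch buffer w first and
             copy the result down afterwards; when v is already device memory the
             permutation gather below writes into it directly */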
4595:     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

4597:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4598:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4599:     thrust::for_each(zibit,zieit,VecCUDAEquals());
4600:     if (w) {
4601:       cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4602:     }
4603:     delete w;
4604:   } else {
4605:     cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4606:   }
4607:   if (!dmem) { PetscLogGpuToCpu(n*sizeof(PetscScalar)); } /* the values were just copied device-to-host */
4608:   MatSeqAIJCUSPARSERestoreArrayRead(A,&av);
4609:   return(0);
4610: }