Actual source code: aijcusparse.cu

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library.
  4: */
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX
  7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  9: #include <petscconf.h>
 10: #include <../src/mat/impls/aij/seq/aij.h>
 11: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 12: #include <../src/vec/vec/impls/dvecimpl.h>
 13: #include <petsc/private/vecimpl.h>
 14: #undef VecType
 15: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
 16: #include <thrust/async/for_each.h>
 17: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 18: #include <cooperative_groups.h>
 19: #endif
 20: const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
 21: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 22:   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
 23:     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

 25:   typedef enum {
 26:       CUSPARSE_MV_ALG_DEFAULT = 0,
 27:       CUSPARSE_COOMV_ALG      = 1,
 28:       CUSPARSE_CSRMV_ALG1     = 2,
 29:       CUSPARSE_CSRMV_ALG2     = 3
 30:   } cusparseSpMVAlg_t;

 32:   typedef enum {
 33:       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
 34:       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
 35:       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
 36:       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
 37:       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
 38:       CUSPARSE_SPMM_ALG_DEFAULT = 0,
 39:       CUSPARSE_SPMM_COO_ALG1    = 1,
 40:       CUSPARSE_SPMM_COO_ALG2    = 2,
 41:       CUSPARSE_SPMM_COO_ALG3    = 3,
 42:       CUSPARSE_SPMM_COO_ALG4    = 5,
 43:       CUSPARSE_SPMM_CSR_ALG1    = 4,
 44:       CUSPARSE_SPMM_CSR_ALG2    = 6,
 45:   } cusparseSpMMAlg_t;

 47:   typedef enum {
 48:       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
 49:       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
 50:   } cusparseCsr2CscAlg_t;
 51:   */
 52:   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
 53:   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
 54:   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
 55: #endif
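
  /* For illustration (an editorial sketch, not part of the original source): PetscOptionsEnum()
     returns the 0-based position of the selected string, so the arrays above must list the
     algorithm names in the same order as the cuSPARSE enum values they stand for. Assuming the
     standard options database, for example,

        -mat_cusparse_spmv_alg csrmv_alg1

     selects index 2 of MatCUSPARSESpMVAlgorithms[], which must coincide with
     CUSPARSE_CSRMV_ALG1 == 2; MatSetFromOptions_SeqAIJCUSPARSE() below errors out if the
     cuSPARSE values ever drift from these positions. */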

 57: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 58: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 59: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 61: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*);
 62: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*);
 63: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 64: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 65: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 67: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 68: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 69: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 70: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 71: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 72: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
 73: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
 74: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 75: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 76: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 77: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 78: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 79: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 80: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

 82: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 83: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 84: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 85: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
 86: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 87: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 89: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
 90: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
 91: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

 93: PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
 94: PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

 96: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

 98: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 99: {
100:   cusparseStatus_t   stat;
101:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

104:   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
105:   cusparsestruct->stream = stream;
106:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
107:   return(0);
108: }
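
/* Example (a sketch; assumes the caller creates and owns the stream): run the matrix's
   cuSPARSE operations on a non-default CUDA stream:

     cudaStream_t stream;
     cudaStreamCreate(&stream);
     MatCUSPARSESetStream(A,stream);
*/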

110: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
111: {
112:   cusparseStatus_t   stat;
113:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

116:   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
117:   if (cusparsestruct->handle != handle) {
118:     if (cusparsestruct->handle) {
119:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
120:     }
121:     cusparsestruct->handle = handle;
122:   }
123:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
124:   return(0);
125: }

127: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
128: {
129:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
130:   PetscBool          flg;
131:   PetscErrorCode     ierr;

134:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
135:   if (!flg || !cusparsestruct) return(0);
136:   if (cusparsestruct->handle) cusparsestruct->handle = 0;
137:   return(0);
138: }

140: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
141: {
143:   *type = MATSOLVERCUSPARSE;
144:   return(0);
145: }

147: /*MC
148:   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
149:   on a single GPU, for matrices of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. Currently supported
150:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
151:   performance in the triangular solves. Full LU and Cholesky decompositions can be applied through the
152:   CUSPARSE triangular solve algorithm, but the performance can be quite poor, so these
153:   algorithms are not recommended. This class does NOT support direct solver operations.
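
  Example usage (a sketch; selects this solver for an ILU factorization from the options database):
     -pc_type ilu -pc_factor_mat_solver_type cusparse -mat_type aijcusparse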

155:   Level: beginner

157: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158: M*/

160: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
161: {
163:   PetscInt       n = A->rmap->n;

166:   MatCreate(PetscObjectComm((PetscObject)A),B);
167:   MatSetSizes(*B,n,n,n,n);
168:   (*B)->factortype = ftype;
169:   (*B)->useordering = PETSC_TRUE;
170:   MatSetType(*B,MATSEQAIJCUSPARSE);

172:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
173:     MatSetBlockSizesFromMats(*B,A,A);
174:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
175:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
176:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
177:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
178:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
179:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

181:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
182:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
183:   return(0);
184: }

186: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
187: {
188:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

191:   switch (op) {
192:   case MAT_CUSPARSE_MULT:
193:     cusparsestruct->format = format;
194:     break;
195:   case MAT_CUSPARSE_ALL:
196:     cusparsestruct->format = format;
197:     break;
198:   default:
199:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
200:   }
201:   return(0);
202: }

204: /*@
205:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
206:    operation. Only the MatMult operation can use different GPU storage formats
207:    for MPIAIJCUSPARSE matrices.

208:    Not Collective

210:    Input Parameters:
211: +  A - Matrix of type SEQAIJCUSPARSE
212: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
213: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)

217:    Level: intermediate

219: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
220: @*/
221: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
222: {

227:   PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
228:   return(0);
229: }
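
/* Example (a sketch, not taken from the original source; assumes A is an assembled
   MATSEQAIJCUSPARSE matrix):

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);  // ELL for MatMult only
     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);   // CSR for all operations

   The same choices are available at run time via -mat_cusparse_mult_storage_format
   and -mat_cusparse_storage_format. */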

231: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
232: {

236:   switch (op) {
237:     case MAT_FORM_EXPLICIT_TRANSPOSE:
238:       /* need to destroy the transpose matrix if present, to prevent logic errors if flg is set to true later */
239:       if (A->form_explicit_transpose && !flg) {MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);}
240:       A->form_explicit_transpose = flg;
241:       break;
242:     default:
243:       MatSetOption_SeqAIJ(A,op,flg);
244:       break;
245:   }
246:   return(0);
247: }
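
/* Example (a sketch; assumes A is a MATSEQAIJCUSPARSE matrix): request that an explicit
   transpose be formed and cached for transpose multiplies:

     MatSetOption(A,MAT_FORM_EXPLICIT_TRANSPOSE,PETSC_TRUE);

   Turning the option back off invalidates any cached transpose, as handled above. */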

249: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

251: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
252: {
253:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
254:   IS             isrow = b->row,iscol = b->col;
255:   PetscBool      row_identity,col_identity;

259:   MatSeqAIJCUSPARSECopyFromGPU(A);
260:   MatLUFactorNumeric_SeqAIJ(B,A,info);
261:   B->offloadmask = PETSC_OFFLOAD_CPU;
262:   /* determine which version of MatSolve needs to be used. */
263:   ISIdentity(isrow,&row_identity);
264:   ISIdentity(iscol,&col_identity);
265:   if (row_identity && col_identity) {
266:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
267:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
268:     B->ops->matsolve = NULL;
269:     B->ops->matsolvetranspose = NULL;
270:   } else {
271:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
272:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
273:     B->ops->matsolve = NULL;
274:     B->ops->matsolvetranspose = NULL;
275:   }

277:   /* get the triangular factors */
278:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
279:   return(0);
280: }

282: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
283: {
284:   PetscErrorCode           ierr;
285:   MatCUSPARSEStorageFormat format;
286:   PetscBool                flg;
287:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

290:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
291:   if (A->factortype == MAT_FACTOR_NONE) {
292:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
293:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
294:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);}

296:     PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
297:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
298:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);}
299:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
300:     PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
301:                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
302:     /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
303:     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

305:     PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
306:                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
307:     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

309:     PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
310:                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
311:     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
312:    #endif
313:   }
314:   PetscOptionsTail();
315:   return(0);
316: }
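
/* Example command-line usage of the options registered above (a sketch; values are
   matched case-insensitively against the string arrays near the top of this file):

     -mat_cusparse_storage_format ell       use ELL for both SpMV and TriSolve
     -mat_cusparse_spmv_alg csrmv_alg2      CUDA >= 11 only
     -mat_cusparse_csr2csc_alg alg2         CUDA >= 11 only
*/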

318: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
319: {
320:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
321:   PetscErrorCode               ierr;

324:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
325:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
326:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
327:   return(0);
328: }

330: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
331: {
332:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
333:   PetscErrorCode               ierr;

336:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
337:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
338:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
339:   return(0);
340: }

342: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
343: {
344:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
345:   PetscErrorCode               ierr;

348:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
349:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
350:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
351:   return(0);
352: }

354: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
355: {
356:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
357:   PetscErrorCode               ierr;

360:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
361:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
362:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
363:   return(0);
364: }

366: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
367: {
368:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
369:   PetscInt                          n = A->rmap->n;
370:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
371:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
372:   cusparseStatus_t                  stat;
373:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
374:   const MatScalar                   *aa = a->a,*v;
375:   PetscInt                          *AiLo, *AjLo;
376:   PetscInt                          i,nz, nzLower, offset, rowOffset;
377:   PetscErrorCode                    ierr;
378:   cudaError_t                       cerr;

381:   if (!n) return(0);
382:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
383:     try {
384:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
385:       nzLower=n+ai[n]-ai[1];
386:       if (!loTriFactor) {
387:         PetscScalar                       *AALo;

389:         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

391:         /* Allocate Space for the lower triangular matrix */
392:         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
393:         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

395:         /* Fill the lower triangular matrix */
396:         AiLo[0]  = (PetscInt) 0;
397:         AiLo[n]  = nzLower;
398:         AjLo[0]  = (PetscInt) 0;
399:         AALo[0]  = (MatScalar) 1.0;
400:         v        = aa;
401:         vi       = aj;
402:         offset   = 1;
403:         rowOffset= 1;
404:         for (i=1; i<n; i++) {
405:           nz = ai[i+1] - ai[i];
406:           /* additional 1 for the term on the diagonal */
407:           AiLo[i]    = rowOffset;
408:           rowOffset += nz+1;

410:           PetscArraycpy(&(AjLo[offset]), vi, nz);
411:           PetscArraycpy(&(AALo[offset]), v, nz);

413:           offset      += nz;
414:           AjLo[offset] = (PetscInt) i;
415:           AALo[offset] = (MatScalar) 1.0;
416:           offset      += 1;

418:           v  += nz;
419:           vi += nz;
420:         }
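
          /* Worked illustration (an editorial sketch): for n = 2 with a single strictly-lower
             entry l10 in row 1, the loop above yields
               AiLo = {0, 1, 3}, AjLo = {0, 0, 1}, AALo = {1.0, l10, 1.0},
             i.e. each row stores its strictly-lower entries followed by an explicit unit
             diagonal entry. */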

422:         /* allocate space for the triangular factor information */
423:         PetscNew(&loTriFactor);
424:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
425:         /* Create the matrix description */
426:         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
427:         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
428:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
429:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
430:        #else
431:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
432:        #endif
433:         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
434:         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

436:         /* set the operation */
437:         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

439:         /* set the matrix */
440:         loTriFactor->csrMat = new CsrMatrix;
441:         loTriFactor->csrMat->num_rows = n;
442:         loTriFactor->csrMat->num_cols = n;
443:         loTriFactor->csrMat->num_entries = nzLower;

445:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
446:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

448:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
449:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

451:         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
452:         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

454:         /* Create the solve analysis information */
455:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
456:         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
457:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
458:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
459:                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
460:                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
461:                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
462:                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
463:         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
464:       #endif

466:         /* perform the solve analysis */
467:         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
468:                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
469:                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
470:                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
471:                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
472:                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
473:                                #endif
474: );CHKERRCUSPARSE(stat);
475:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
476:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

478:         /* assign the pointer */
479:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
480:         loTriFactor->AA_h = AALo;
481:         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
482:         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
483:         PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
484:       } else { /* update values only */
485:         if (!loTriFactor->AA_h) {
486:           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
487:         }
488:         /* Fill the lower triangular matrix */
489:         loTriFactor->AA_h[0]  = 1.0;
490:         v        = aa;
491:         vi       = aj;
492:         offset   = 1;
493:         for (i=1; i<n; i++) {
494:           nz = ai[i+1] - ai[i];
495:           PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);
496:           offset      += nz;
497:           loTriFactor->AA_h[offset] = 1.0;
498:           offset      += 1;
499:           v  += nz;
500:         }
501:         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
502:         PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));
503:       }
504:     } catch(char *ex) {
505:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
506:     }
507:   }
508:   return(0);
509: }

511: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
512: {
513:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
514:   PetscInt                          n = A->rmap->n;
515:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
516:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
517:   cusparseStatus_t                  stat;
518:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
519:   const MatScalar                   *aa = a->a,*v;
520:   PetscInt                          *AiUp, *AjUp;
521:   PetscInt                          i,nz, nzUpper, offset;
522:   PetscErrorCode                    ierr;
523:   cudaError_t                       cerr;

526:   if (!n) return(0);
527:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
528:     try {
529:       /* figure out the number of nonzeros in the upper triangular matrix. */
530:       nzUpper = adiag[0]-adiag[n];
531:       if (!upTriFactor) {
532:         PetscScalar *AAUp;

534:         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

536:         /* Allocate Space for the upper triangular matrix */
537:         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
538:         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

540:         /* Fill the upper triangular matrix */
541:         AiUp[0]=(PetscInt) 0;
542:         AiUp[n]=nzUpper;
543:         offset = nzUpper;
544:         for (i=n-1; i>=0; i--) {
545:           v  = aa + adiag[i+1] + 1;
546:           vi = aj + adiag[i+1] + 1;

548:           /* number of elements NOT on the diagonal */
549:           nz = adiag[i] - adiag[i+1]-1;

551:           /* decrement the offset */
552:           offset -= (nz+1);

554:           /* first, set the diagonal elements */
555:           AjUp[offset] = (PetscInt) i;
556:           AAUp[offset] = (MatScalar)1./v[nz];
557:           AiUp[i]      = AiUp[i+1] - (nz+1);

559:           PetscArraycpy(&(AjUp[offset+1]), vi, nz);
560:           PetscArraycpy(&(AAUp[offset+1]), v, nz);
561:         }

563:         /* allocate space for the triangular factor information */
564:         PetscNew(&upTriFactor);
565:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

567:         /* Create the matrix description */
568:         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
569:         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
570:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
571:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
572:        #else
573:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
574:        #endif
575:         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
576:         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

578:         /* set the operation */
579:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

581:         /* set the matrix */
582:         upTriFactor->csrMat = new CsrMatrix;
583:         upTriFactor->csrMat->num_rows = n;
584:         upTriFactor->csrMat->num_cols = n;
585:         upTriFactor->csrMat->num_entries = nzUpper;

587:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
588:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

590:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
591:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

593:         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
594:         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

596:         /* Create the solve analysis information */
597:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
598:         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
599:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
600:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
601:                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
602:                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
603:                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
604:                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
605:         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
606:       #endif

608:         /* perform the solve analysis */
609:         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
610:                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
611:                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
612:                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
613:                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
614:                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
615:                                #endif
616: );CHKERRCUSPARSE(stat);
617:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
618:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

620:         /* assign the pointer */
621:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
622:         upTriFactor->AA_h = AAUp;
623:         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
624:         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
625:         PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
626:       } else {
627:         if (!upTriFactor->AA_h) {
628:           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
629:         }
630:         /* Fill the upper triangular matrix */
631:         offset = nzUpper;
632:         for (i=n-1; i>=0; i--) {
633:           v  = aa + adiag[i+1] + 1;

635:           /* number of elements NOT on the diagonal */
636:           nz = adiag[i] - adiag[i+1]-1;

638:           /* decrement the offset */
639:           offset -= (nz+1);

641:           /* first, set the diagonal elements */
642:           upTriFactor->AA_h[offset] = 1./v[nz];
643:           PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);
644:         }
645:         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
646:         PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));
647:       }
648:     } catch(char *ex) {
649:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
650:     }
651:   }
652:   return(0);
653: }

655: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
656: {
657:   PetscErrorCode               ierr;
658:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
659:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
660:   IS                           isrow = a->row,iscol = a->icol;
661:   PetscBool                    row_identity,col_identity;
662:   PetscInt                     n = A->rmap->n;

665:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
666:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
667:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

669:   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
670:   cusparseTriFactors->nnz=a->nz;

672:   A->offloadmask = PETSC_OFFLOAD_BOTH;
673:   /* lower triangular indices */
674:   ISIdentity(isrow,&row_identity);
675:   if (!row_identity && !cusparseTriFactors->rpermIndices) {
676:     const PetscInt *r;

678:     ISGetIndices(isrow,&r);
679:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
680:     cusparseTriFactors->rpermIndices->assign(r, r+n);
681:     ISRestoreIndices(isrow,&r);
682:     PetscLogCpuToGpu(n*sizeof(PetscInt));
683:   }

685:   /* upper triangular indices */
686:   ISIdentity(iscol,&col_identity);
687:   if (!col_identity && !cusparseTriFactors->cpermIndices) {
688:     const PetscInt *c;

690:     ISGetIndices(iscol,&c);
691:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
692:     cusparseTriFactors->cpermIndices->assign(c, c+n);
693:     ISRestoreIndices(iscol,&c);
694:     PetscLogCpuToGpu(n*sizeof(PetscInt));
695:   }
696:   return(0);
697: }

699: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
700: {
701:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
702:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
703:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
704:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
705:   cusparseStatus_t                  stat;
706:   PetscErrorCode                    ierr;
707:   cudaError_t                       cerr;
708:   PetscInt                          *AiUp, *AjUp;
709:   PetscScalar                       *AAUp;
710:   PetscScalar                       *AALo;
711:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
712:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
713:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
714:   const MatScalar                   *aa = b->a,*v;

717:   if (!n) return(0);
718:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
719:     try {
720:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
721:       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
722:       if (!upTriFactor && !loTriFactor) {
723:         /* Allocate Space for the upper triangular matrix */
724:         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
725:         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

727:         /* Fill the upper triangular matrix */
728:         AiUp[0]=(PetscInt) 0;
729:         AiUp[n]=nzUpper;
730:         offset = 0;
731:         for (i=0; i<n; i++) {
732:           /* set the pointers */
733:           v  = aa + ai[i];
734:           vj = aj + ai[i];
735:           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

737:           /* first, set the diagonal elements */
738:           AjUp[offset] = (PetscInt) i;
739:           AAUp[offset] = (MatScalar)1.0/v[nz];
740:           AiUp[i]      = offset;
741:           AALo[offset] = (MatScalar)1.0/v[nz];

743:           offset+=1;
744:           if (nz>0) {
745:             PetscArraycpy(&(AjUp[offset]), vj, nz);
746:             PetscArraycpy(&(AAUp[offset]), v, nz);
747:             for (j=offset; j<offset+nz; j++) {
748:               AAUp[j] = -AAUp[j];
749:               AALo[j] = AAUp[j]/v[nz];
750:             }
751:             offset+=nz;
752:           }
753:         }

755:         /* allocate space for the triangular factor information */
756:         PetscNew(&upTriFactor);
757:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

759:         /* Create the matrix description */
760:         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
761:         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
762:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
763:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
764:        #else
765:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
766:        #endif
767:         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
768:         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

770:         /* set the matrix */
771:         upTriFactor->csrMat = new CsrMatrix;
772:         upTriFactor->csrMat->num_rows = A->rmap->n;
773:         upTriFactor->csrMat->num_cols = A->cmap->n;
774:         upTriFactor->csrMat->num_entries = a->nz;

776:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
777:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

779:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
780:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

782:         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
783:         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

785:         /* set the operation */
786:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

788:         /* Create the solve analysis information */
789:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
790:         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
791:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
792:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
793:                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
794:                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
795:                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
796:                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
797:         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
798:       #endif

800:         /* perform the solve analysis */
801:         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
802:                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
803:                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
804:                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
805:                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
806:                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
807:                                 #endif
808: );CHKERRCUSPARSE(stat);
809:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
810:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

812:         /* assign the pointer */
813:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

815:         /* allocate space for the triangular factor information */
816:         PetscNew(&loTriFactor);
817:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

819:         /* Create the matrix description */
820:         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
821:         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
822:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
823:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
824:        #else
825:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
826:        #endif
827:         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
828:         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

830:         /* set the operation */
831:         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
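
        /* Note: for ICC the "lower" factor reuses the upper-triangular structure built above
           and is applied with a transposed solve, which is why its fill mode is set to UPPER. */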

833:         /* set the matrix */
834:         loTriFactor->csrMat = new CsrMatrix;
835:         loTriFactor->csrMat->num_rows = A->rmap->n;
836:         loTriFactor->csrMat->num_cols = A->cmap->n;
837:         loTriFactor->csrMat->num_entries = a->nz;

839:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
840:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

842:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
843:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

845:         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
846:         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

848:         /* Create the solve analysis information */
849:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
850:         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
851:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
852:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
853:                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
854:                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
855:                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
856:                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
857:         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
858:       #endif

860:         /* perform the solve analysis */
861:         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
862:                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
863:                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
864:                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
865:                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
866:                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
867:                                 #endif
868: );CHKERRCUSPARSE(stat);
869:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
870:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

872:         /* assign the pointer */
873:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

875:         PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));
876:         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
877:         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
878:       } else {
879:         /* Fill the upper triangular matrix */
880:         offset = 0;
881:         for (i=0; i<n; i++) {
882:           /* set the pointers */
883:           v  = aa + ai[i];
884:           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

886:           /* first, set the diagonal elements */
887:           AAUp[offset] = 1.0/v[nz];
888:           AALo[offset] = 1.0/v[nz];

890:           offset+=1;
891:           if (nz>0) {
892:             PetscArraycpy(&(AAUp[offset]), v, nz);
893:             for (j=offset; j<offset+nz; j++) {
894:               AAUp[j] = -AAUp[j];
895:               AALo[j] = AAUp[j]/v[nz];
896:             }
897:             offset+=nz;
898:           }
899:         }
900:         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing upTriFactor");
901:         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing loTriFactor");
902:         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
903:         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
904:         PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));
905:       }
906:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
907:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
908:     } catch(char *ex) {
909:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
910:     }
911:   }
912:   return(0);
913: }

915: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
916: {
917:   PetscErrorCode               ierr;
918:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
919:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
920:   IS                           ip = a->row;
921:   PetscBool                    perm_identity;
922:   PetscInt                     n = A->rmap->n;

925:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
926:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
927:   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
928:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;

930:   A->offloadmask = PETSC_OFFLOAD_BOTH;

932:   /* lower triangular indices */
933:   ISIdentity(ip,&perm_identity);
934:   if (!perm_identity) {
935:     IS             iip;
936:     const PetscInt *irip,*rip;

938:     ISInvertPermutation(ip,PETSC_DECIDE,&iip);
939:     ISGetIndices(iip,&irip);
940:     ISGetIndices(ip,&rip);
941:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
942:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
943:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
944:     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
945:     ISRestoreIndices(iip,&irip);
946:     ISDestroy(&iip);
947:     ISRestoreIndices(ip,&rip);
948:     PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
949:   }
950:   return(0);
951: }

953: #define CHECK_LAUNCH_ERROR()                                                             \
954: do {                                                                                     \
955:   /* Check synchronous errors, i.e. pre-launch */                                        \
956:   cudaError_t err = cudaGetLastError();                                                  \
957:   if (cudaSuccess != err) {                                                              \
958:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
959:   }                                                                                      \
960:   /* Check asynchronous errors, i.e. kernel failed (ULF, unspecified launch failure) */    \
961:   err = cudaDeviceSynchronize();                                                         \
962:   if (cudaSuccess != err) {                                                              \
963:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
964:   }                                                                                      \
965:  } while (0)
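
/* Example use of CHECK_LAUNCH_ERROR() (a sketch; myKernel is a placeholder, not a kernel
   defined in this file):

     myKernel<<<grid,block>>>(args);
     CHECK_LAUNCH_ERROR();  // synchronous check first, then cudaDeviceSynchronize()

   The first check catches launch-configuration errors; the synchronize catches faults
   raised while the kernel was running. */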

967: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
968: {
969:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
970:   IS             ip = b->row;
971:   PetscBool      perm_identity;

975:   MatSeqAIJCUSPARSECopyFromGPU(A);
976:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
977:   B->offloadmask = PETSC_OFFLOAD_CPU;
978:   /* determine which version of MatSolve needs to be used. */
979:   ISIdentity(ip,&perm_identity);
980:   if (perm_identity) {
981:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
982:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
983:     B->ops->matsolve = NULL;
984:     B->ops->matsolvetranspose = NULL;
985:   } else {
986:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
987:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
988:     B->ops->matsolve = NULL;
989:     B->ops->matsolvetranspose = NULL;
990:   }

992:   /* get the triangular factors */
993:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
994:   return(0);
995: }

997: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
998: {
999:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1000:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1001:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1002:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1003:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1004:   cusparseStatus_t                  stat;
1005:   cusparseIndexBase_t               indexBase;
1006:   cusparseMatrixType_t              matrixType;
1007:   cusparseFillMode_t                fillMode;
1008:   cusparseDiagType_t                diagType;
1009:   cudaError_t                       cerr;
1010:   PetscErrorCode                    ierr;

1013:   /* allocate space for the transpose of the lower triangular factor */
1014:   PetscNew(&loTriFactorT);
1015:   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1017:   /* set the matrix descriptors of the lower triangular factor */
1018:   matrixType = cusparseGetMatType(loTriFactor->descr);
1019:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1020:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1021:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1022:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

1024:   /* Create the matrix description */
1025:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1026:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1027:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1028:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1029:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

1031:   /* set the operation */
1032:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1034:   /* allocate GPU space for the CSC of the lower triangular factor*/
1035:   loTriFactorT->csrMat = new CsrMatrix;
1036:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1037:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1038:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1039:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1040:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1041:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

1043:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
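  /* (The CSC arrays of a CSR matrix are exactly the CSR arrays of its transpose, so one
     csr2csc conversion produces the transposed factor without assembling it by hand.) */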
1044: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1045:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1046:                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1047:                                        loTriFactor->csrMat->values->data().get(),
1048:                                        loTriFactor->csrMat->row_offsets->data().get(),
1049:                                        loTriFactor->csrMat->column_indices->data().get(),
1050:                                        loTriFactorT->csrMat->values->data().get(),
1051:                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1052:                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1053:                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1054:   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1055: #endif

1057:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1058:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1059:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1060:                           loTriFactor->csrMat->values->data().get(),
1061:                           loTriFactor->csrMat->row_offsets->data().get(),
1062:                           loTriFactor->csrMat->column_indices->data().get(),
1063:                           loTriFactorT->csrMat->values->data().get(),
1064:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1065:                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1066:                           CUSPARSE_ACTION_NUMERIC, indexBase,
1067:                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
1068:                         #else
1069:                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1070:                           CUSPARSE_ACTION_NUMERIC, indexBase
1071:                         #endif
1072: );CHKERRCUSPARSE(stat);
1073:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1074:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);

1076:   /* Create the solve analysis information */
1077:   PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1078:   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1079: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1080:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1081:                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1082:                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1083:                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1084:                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1085:   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1086: #endif

1088:   /* perform the solve analysis */
1089:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1090:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1091:                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1092:                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
1093:                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1094:                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1095:                           #endif
1096: );CHKERRCUSPARSE(stat);
1097:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1098:   PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

1100:   /* assign the pointer */
1101:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

1103:   /*********************************************/
1104:   /* Now the Transpose of the Upper Tri Factor */
1105:   /*********************************************/

1107:   /* allocate space for the transpose of the upper triangular factor */
1108:   PetscNew(&upTriFactorT);
1109:   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1111:   /* set the matrix descriptors of the upper triangular factor */
1112:   matrixType = cusparseGetMatType(upTriFactor->descr);
1113:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1114:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1115:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1116:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

1118:   /* Create the matrix description */
1119:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1120:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1121:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1122:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1123:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

1125:   /* set the operation */
1126:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1128:   /* allocate GPU space for the CSC of the upper triangular factor */
1129:   upTriFactorT->csrMat = new CsrMatrix;
1130:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1131:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1132:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1133:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1134:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1135:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

1137:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1138: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1139:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1140:                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1141:                                 upTriFactor->csrMat->values->data().get(),
1142:                                 upTriFactor->csrMat->row_offsets->data().get(),
1143:                                 upTriFactor->csrMat->column_indices->data().get(),
1144:                                 upTriFactorT->csrMat->values->data().get(),
1145:                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1146:                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1147:                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1148:   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1149: #endif

1151:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1152:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1153:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1154:                           upTriFactor->csrMat->values->data().get(),
1155:                           upTriFactor->csrMat->row_offsets->data().get(),
1156:                           upTriFactor->csrMat->column_indices->data().get(),
1157:                           upTriFactorT->csrMat->values->data().get(),
1158:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1159:                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1160:                           CUSPARSE_ACTION_NUMERIC, indexBase,
1161:                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1162:                         #else
1163:                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1164:                           CUSPARSE_ACTION_NUMERIC, indexBase
1165:                         #endif
1166: );CHKERRCUSPARSE(stat);
1167:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1168:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);

1170:   /* Create the solve analysis information */
1171:   PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1172:   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1173:   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1174:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1175:                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1176:                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1177:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1178:                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1179:   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1180:   #endif

1182:   /* perform the solve analysis */
1183:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1184:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1185:                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1186:                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1187:                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1188:                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1189:                           #endif
1190: );CHKERRCUSPARSE(stat);
1191:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1192:   PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

1194:   /* assign the pointer */
1195:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1196:   return(0);
1197: }
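
/* For reference: the cusparse_csr2csc macro used above hides two different calling
   conventions. A sketch of both (argument names are illustrative, not declarations):

   CUDA >= 11 (cusparseCsr2cscEx2, needs an explicit work buffer):
     cusparseCsr2cscEx2_bufferSize(handle,m,n,nnz,vals,rowptr,colidx,valsT,rowptrT,colidxT,
                                   valtype,CUSPARSE_ACTION_NUMERIC,idxbase,CUSPARSE_CSR2CSC_ALG1,&bufsize);
     cudaMalloc(&buf,bufsize);
     cusparseCsr2cscEx2(handle,m,n,nnz,vals,rowptr,colidx,valsT,rowptrT,colidxT,
                        valtype,CUSPARSE_ACTION_NUMERIC,idxbase,CUSPARSE_CSR2CSC_ALG1,buf);

   CUDA < 11 (legacy cusparseXcsr2csc, no buffer; note the transposed index arrays swap order):
     cusparseXcsr2csc(handle,m,n,nnz,vals,rowptr,colidx,valsT,colidxT,rowptrT,
                      CUSPARSE_ACTION_NUMERIC,idxbase);
*/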

1199: struct PetscScalarToPetscInt
1200: {
1201:   __host__ __device__
1202:   PetscInt operator()(PetscScalar s)
1203:   {
1204:     return (PetscInt)PetscRealPart(s);
1205:   }
1206: };
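
/* PetscScalarToPetscInt is used below (in MatSeqAIJCUSPARSEFormExplicitTransposeForMult)
   to recover an integer permutation from a csr2csc pass run on the sequence 0,1,2,...:
   the transposed "values" are the CSR positions of the CSC entries, stored as
   PetscScalar, and are truncated back to PetscInt. This is exact as long as the
   number of nonzeros is representable in the mantissa of PetscScalar. */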

1208: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
1209: {
1210:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1211:   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1212:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1213:   cusparseStatus_t             stat;
1214:   cusparseIndexBase_t          indexBase;
1215:   cudaError_t                  err;
1216:   PetscErrorCode               ierr;

1219:   if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) return(0);
1220:   MatSeqAIJCUSPARSECopyToGPU(A);
1221:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1222:   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
1223:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1224:   if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
1225:   if (A->transupdated) return(0);
1226:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1227:   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1228:     MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1229:   }
1230:   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1231:     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1232:     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1233:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1234:     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1235:     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1237:     /* set alpha and beta */
1238:     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1239:     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1240:     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1241:     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1242:     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1243:     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

1245:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1246:       CsrMatrix *matrixT = new CsrMatrix;
1247:       matstructT->mat = matrixT;
1248:       matrixT->num_rows = A->cmap->n;
1249:       matrixT->num_cols = A->rmap->n;
1250:       matrixT->num_entries = a->nz;
1251:       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1252:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1253:       matrixT->values = new THRUSTARRAY(a->nz);

1255:       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1256:       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

1258:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259:       stat = cusparseCreateCsr(&matstructT->matDescr,
1260:                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1261:                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1262:                                matrixT->values->data().get(),
1263:                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1264:                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1265:      #endif
1266:     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1267:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1268:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1269:    #else
1270:       CsrMatrix *temp  = new CsrMatrix;
1271:       CsrMatrix *tempT = new CsrMatrix;
1272:       /* First convert HYB to CSR */
1273:       temp->num_rows = A->rmap->n;
1274:       temp->num_cols = A->cmap->n;
1275:       temp->num_entries = a->nz;
1276:       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1277:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1278:       temp->values = new THRUSTARRAY(a->nz);

1280:       stat = cusparse_hyb2csr(cusparsestruct->handle,
1281:                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1282:                               temp->values->data().get(),
1283:                               temp->row_offsets->data().get(),
1284:                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);

1286:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1287:       tempT->num_rows = A->cmap->n; /* the transpose of an m x n matrix is n x m */
1288:       tempT->num_cols = A->rmap->n;
1289:       tempT->num_entries = a->nz;
1290:       tempT->row_offsets = new THRUSTINTARRAY32(A->cmap->n+1);
1291:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1292:       tempT->values = new THRUSTARRAY(a->nz);

1294:       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1295:                               temp->num_cols, temp->num_entries,
1296:                               temp->values->data().get(),
1297:                               temp->row_offsets->data().get(),
1298:                               temp->column_indices->data().get(),
1299:                               tempT->values->data().get(),
1300:                               tempT->column_indices->data().get(),
1301:                               tempT->row_offsets->data().get(),
1302:                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

1304:       /* Last, convert CSC to HYB */
1305:       cusparseHybMat_t hybMat;
1306:       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1307:       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1308:         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1309:       stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
1310:                               matstructT->descr, tempT->values->data().get(),
1311:                               tempT->row_offsets->data().get(),
1312:                               tempT->column_indices->data().get(),
1313:                               hybMat, 0, partition);CHKERRCUSPARSE(stat);

1315:       /* assign the pointer */
1316:       matstructT->mat = hybMat;
1317:       A->transupdated = PETSC_TRUE;
1318:       /* delete temporaries */
1319:       if (tempT) {
1320:         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1321:         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1322:         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1323:         delete (CsrMatrix*) tempT;
1324:       }
1325:       if (temp) {
1326:         if (temp->values) delete (THRUSTARRAY*) temp->values;
1327:         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1328:         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1329:         delete (CsrMatrix*) temp;
1330:       }
1331:      #endif
1332:     }
1333:   }
1334:   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* the transpose mat struct may already be present; update its data */
1335:     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1336:     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1337:     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
1338:     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
1339:     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
1340:     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
1341:     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
1342:     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
1343:     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
1344:     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
1345:     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1346:       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1347:       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1348:       PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
1349:     }
1350:     if (!cusparsestruct->csr2csc_i) {
1351:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1352:       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

1354:       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1355:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1356:       void   *csr2cscBuffer;
1357:       size_t csr2cscBufferSize;
1358:       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1359:                                            A->cmap->n, matrix->num_entries,
1360:                                            matrix->values->data().get(),
1361:                                            cusparsestruct->rowoffsets_gpu->data().get(),
1362:                                            matrix->column_indices->data().get(),
1363:                                            matrixT->values->data().get(),
1364:                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1365:                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1366:                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1367:       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1368:      #endif

1370:       if (matrix->num_entries) {
1371:         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1372:            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1373:            I checked every parameter and they were all fine; I have no clue why cusparse complains.

1375:            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1376:            should be filled with indexBase. So I just take a shortcut here.
1377:         */
1378:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1379:                               A->cmap->n,matrix->num_entries,
1380:                               csr2csc_a.data().get(),
1381:                               cusparsestruct->rowoffsets_gpu->data().get(),
1382:                               matrix->column_indices->data().get(),
1383:                               matrixT->values->data().get(),
1384:                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1385:                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1386:                               CUSPARSE_ACTION_NUMERIC,indexBase,
1387:                               cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1388:                              #else
1389:                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1390:                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1391:                              #endif
1392:       } else {
1393:         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1394:       }

1396:       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1397:       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1398:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1399:       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1400:      #endif
1401:     }
1402:     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1403:                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1404:                                                      matrixT->values->begin()));
1405:   }
1406:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1407:   /* the compressed row indices are not used for matTranspose */
1408:   matstructT->cprowIndices = NULL;
1409:   /* assign the pointer */
1410:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1411:   A->transupdated = PETSC_TRUE;
1412:   return(0);
1413: }
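
/* A small worked example of the csr2csc_i caching above, assuming the 2x2 CSR matrix
   [[a,b],[c,0]] with values = [a,b,c]: a csr2csc pass on the sequence [0,1,2] yields
   the transposed "values" [0,2,1], i.e. the CSR position of each CSC entry, which is
   truncated to csr2csc_i = [0,2,1]. Every subsequent transpose update is then just
   the gather

     matrixT->values[k] = matrix->values[csr2csc_i[k]];

   done with thrust::copy over a permutation iterator, so the (expensive) csr2csc is
   paid only once per nonzero pattern. */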

1415: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? Presumably because the csrsv analysis data is tied to the operation it was created with, and a transpose solve on the CSR factor is generally slower than a non-transpose solve on an explicitly formed transpose, so the transpose is built once and reused. */
1416: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1417: {
1418:   PetscInt                              n = xx->map->n;
1419:   const PetscScalar                     *barray;
1420:   PetscScalar                           *xarray;
1421:   thrust::device_ptr<const PetscScalar> bGPU;
1422:   thrust::device_ptr<PetscScalar>       xGPU;
1423:   cusparseStatus_t                      stat;
1424:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1425:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1426:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1427:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1428:   PetscErrorCode                        ierr;
1429:   cudaError_t                           cerr;

1432:   /* Analyze the matrix and create the transpose ... on the fly */
1433:   if (!loTriFactorT && !upTriFactorT) {
1434:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1435:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1436:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1437:   }

1439:   /* Get the GPU pointers */
1440:   VecCUDAGetArrayWrite(xx,&xarray);
1441:   VecCUDAGetArrayRead(bb,&barray);
1442:   xGPU = thrust::device_pointer_cast(xarray);
1443:   bGPU = thrust::device_pointer_cast(barray);

1445:   PetscLogGpuTimeBegin();
1446:   /* First, reorder with the row permutation */
1447:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1448:                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1449:                xGPU);

1451:   /* First, solve U */
1452:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1453:                         upTriFactorT->csrMat->num_rows,
1454:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1455:                         upTriFactorT->csrMat->num_entries,
1456:                       #endif
1457:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1458:                         upTriFactorT->csrMat->values->data().get(),
1459:                         upTriFactorT->csrMat->row_offsets->data().get(),
1460:                         upTriFactorT->csrMat->column_indices->data().get(),
1461:                         upTriFactorT->solveInfo,
1462:                         xarray, tempGPU->data().get()
1463:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1464:                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1465:                       #endif
1466: );CHKERRCUSPARSE(stat);

1468:   /* Then, solve L */
1469:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1470:                         loTriFactorT->csrMat->num_rows,
1471:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1472:                         loTriFactorT->csrMat->num_entries,
1473:                       #endif
1474:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1475:                         loTriFactorT->csrMat->values->data().get(),
1476:                         loTriFactorT->csrMat->row_offsets->data().get(),
1477:                         loTriFactorT->csrMat->column_indices->data().get(),
1478:                         loTriFactorT->solveInfo,
1479:                         tempGPU->data().get(), xarray
1480:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1481:                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1482:                       #endif
1483: );CHKERRCUSPARSE(stat);

1485:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1486:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1487:                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1488:                tempGPU->begin());

1490:   /* Copy the temporary to the full solution. */
1491:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

1493:   /* restore */
1494:   VecCUDARestoreArrayRead(bb,&barray);
1495:   VecCUDARestoreArrayWrite(xx,&xarray);
1496:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1497:   PetscLogGpuTimeEnd();
1498:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1499:   return(0);
1500: }
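
/* Schematically, with the factorization of the row/column permuted matrix, the
   transpose solve above runs (a sketch of the kernel order, not extra code):

     t = b(rperm);  solve U^T y = t;  solve L^T z = y;  x = z(cperm);

   both triangular solves still use CUSPARSE_OPERATION_NON_TRANSPOSE because they
   operate on the explicitly formed transposes of the factors. */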

1502: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1503: {
1504:   const PetscScalar                 *barray;
1505:   PetscScalar                       *xarray;
1506:   cusparseStatus_t                  stat;
1507:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1508:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1509:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1510:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1511:   PetscErrorCode                    ierr;
1512:   cudaError_t                       cerr;

1515:   /* Analyze the matrix and create the transpose ... on the fly */
1516:   if (!loTriFactorT && !upTriFactorT) {
1517:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1518:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1519:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1520:   }

1522:   /* Get the GPU pointers */
1523:   VecCUDAGetArrayWrite(xx,&xarray);
1524:   VecCUDAGetArrayRead(bb,&barray);

1526:   PetscLogGpuTimeBegin();
1527:   /* First, solve U */
1528:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1529:                         upTriFactorT->csrMat->num_rows,
1530:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1531:                         upTriFactorT->csrMat->num_entries,
1532:                       #endif
1533:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1534:                         upTriFactorT->csrMat->values->data().get(),
1535:                         upTriFactorT->csrMat->row_offsets->data().get(),
1536:                         upTriFactorT->csrMat->column_indices->data().get(),
1537:                         upTriFactorT->solveInfo,
1538:                         barray, tempGPU->data().get()
1539:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1540:                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1541:                       #endif
1542: );CHKERRCUSPARSE(stat);

1544:   /* Then, solve L */
1545:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1546:                         loTriFactorT->csrMat->num_rows,
1547:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1548:                         loTriFactorT->csrMat->num_entries,
1549:                       #endif
1550:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1551:                         loTriFactorT->csrMat->values->data().get(),
1552:                         loTriFactorT->csrMat->row_offsets->data().get(),
1553:                         loTriFactorT->csrMat->column_indices->data().get(),
1554:                         loTriFactorT->solveInfo,
1555:                         tempGPU->data().get(), xarray
1556:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1557:                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1558:                       #endif
1559: );CHKERRCUSPARSE(stat);

1561:   /* restore */
1562:   VecCUDARestoreArrayRead(bb,&barray);
1563:   VecCUDARestoreArrayWrite(xx,&xarray);
1564:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1565:   PetscLogGpuTimeEnd();
1566:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1567:   return(0);
1568: }

1570: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1571: {
1572:   const PetscScalar                     *barray;
1573:   PetscScalar                           *xarray;
1574:   thrust::device_ptr<const PetscScalar> bGPU;
1575:   thrust::device_ptr<PetscScalar>       xGPU;
1576:   cusparseStatus_t                      stat;
1577:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1578:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1579:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1580:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1581:   PetscErrorCode                        ierr;
1582:   cudaError_t                           cerr;


1586:   /* Get the GPU pointers */
1587:   VecCUDAGetArrayWrite(xx,&xarray);
1588:   VecCUDAGetArrayRead(bb,&barray);
1589:   xGPU = thrust::device_pointer_cast(xarray);
1590:   bGPU = thrust::device_pointer_cast(barray);

1592:   PetscLogGpuTimeBegin();
1593:   /* First, reorder with the row permutation */
1594:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1595:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1596:                tempGPU->begin());

1598:   /* Next, solve L */
1599:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1600:                         loTriFactor->csrMat->num_rows,
1601:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1602:                         loTriFactor->csrMat->num_entries,
1603:                       #endif
1604:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1605:                         loTriFactor->csrMat->values->data().get(),
1606:                         loTriFactor->csrMat->row_offsets->data().get(),
1607:                         loTriFactor->csrMat->column_indices->data().get(),
1608:                         loTriFactor->solveInfo,
1609:                         tempGPU->data().get(), xarray
1610:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1611:                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1612:                       #endif
1613: );CHKERRCUSPARSE(stat);

1615:   /* Then, solve U */
1616:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1617:                         upTriFactor->csrMat->num_rows,
1618:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1619:                         upTriFactor->csrMat->num_entries,
1620:                       #endif
1621:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1622:                         upTriFactor->csrMat->values->data().get(),
1623:                         upTriFactor->csrMat->row_offsets->data().get(),
1624:                         upTriFactor->csrMat->column_indices->data().get(),
1625:                         upTriFactor->solveInfo,
1626:                         xarray, tempGPU->data().get()
1627:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1628:                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1629:                       #endif
1630: );CHKERRCUSPARSE(stat);

1632:   /* Last, reorder with the column permutation */
1633:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1634:                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1635:                xGPU);

1637:   VecCUDARestoreArrayRead(bb,&barray);
1638:   VecCUDARestoreArrayWrite(xx,&xarray);
1639:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1640:   PetscLogGpuTimeEnd();
1641:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1642:   return(0);
1643: }
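
/* As in the transpose case above, the reordered solve is schematically

     t = b(rperm);  solve L y = t;  solve U z = y;  x = z(cperm);

   with both permutations applied as gathers through thrust permutation iterators. */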

1645: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1646: {
1647:   const PetscScalar                 *barray;
1648:   PetscScalar                       *xarray;
1649:   cusparseStatus_t                  stat;
1650:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1651:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1652:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1653:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1654:   PetscErrorCode                    ierr;
1655:   cudaError_t                       cerr;

1658:   /* Get the GPU pointers */
1659:   VecCUDAGetArrayWrite(xx,&xarray);
1660:   VecCUDAGetArrayRead(bb,&barray);

1662:   PetscLogGpuTimeBegin();
1663:   /* First, solve L */
1664:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1665:                         loTriFactor->csrMat->num_rows,
1666:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1667:                         loTriFactor->csrMat->num_entries,
1668:                       #endif
1669:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1670:                         loTriFactor->csrMat->values->data().get(),
1671:                         loTriFactor->csrMat->row_offsets->data().get(),
1672:                         loTriFactor->csrMat->column_indices->data().get(),
1673:                         loTriFactor->solveInfo,
1674:                         barray, tempGPU->data().get()
1675:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1676:                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1677:                       #endif
1678: );CHKERRCUSPARSE(stat);

1680:   /* Next, solve U */
1681:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1682:                         upTriFactor->csrMat->num_rows,
1683:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1684:                         upTriFactor->csrMat->num_entries,
1685:                       #endif
1686:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1687:                         upTriFactor->csrMat->values->data().get(),
1688:                         upTriFactor->csrMat->row_offsets->data().get(),
1689:                         upTriFactor->csrMat->column_indices->data().get(),
1690:                         upTriFactor->solveInfo,
1691:                         tempGPU->data().get(), xarray
1692:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1693:                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1694:                       #endif
1695: );CHKERRCUSPARSE(stat);

1697:   VecCUDARestoreArrayRead(bb,&barray);
1698:   VecCUDARestoreArrayWrite(xx,&xarray);
1699:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1700:   PetscLogGpuTimeEnd();
1701:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1702:   return(0);
1703: }

1705: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1706: {
1707:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1708:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1709:   cudaError_t        cerr;
1710:   PetscErrorCode     ierr;

1713:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1714:     CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

1716:     PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1717:     cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1718:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
1719:     PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));
1720:     PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1721:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1722:   }
1723:   return(0);
1724: }

1726: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1727: {
1728:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

1732:   MatSeqAIJCUSPARSECopyFromGPU(A);
1733:   *array = a->a;
1734:   A->offloadmask = PETSC_OFFLOAD_CPU;
1735:   return(0);
1736: }
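
/* A summary of the offload-mask protocol implemented by the two routines above:
   while the mask is PETSC_OFFLOAD_GPU the device copy is authoritative, and
   MatSeqAIJCUSPARSECopyFromGPU downloads the values and moves the state to
   PETSC_OFFLOAD_BOTH; MatSeqAIJGetArray then hands out the host pointer for
   potential writing, so it must downgrade the state to PETSC_OFFLOAD_CPU, which
   forces the next MatSeqAIJCUSPARSECopyToGPU to re-upload the values. */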

1738: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1739: {
1740:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1741:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1742:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1743:   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1744:   PetscErrorCode               ierr;
1745:   cusparseStatus_t             stat;
1746:   PetscBool                    both = PETSC_TRUE;
1747:   cudaError_t                  err;

1750:   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1751:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1752:     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1753:       CsrMatrix *matrix;
1754:       matrix = (CsrMatrix*)cusparsestruct->mat->mat;

1756:       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
1757:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1758:       matrix->values->assign(a->a, a->a+a->nz);
1759:       err  = WaitForCUDA();CHKERRCUDA(err);
1760:       PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1761:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1762:       MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
1763:     } else {
1764:       PetscInt nnz;
1765:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1766:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1767:       MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1768:       delete cusparsestruct->workVector;
1769:       delete cusparsestruct->rowoffsets_gpu;
1770:       cusparsestruct->workVector = NULL;
1771:       cusparsestruct->rowoffsets_gpu = NULL;
1772:       try {
1773:         if (a->compressedrow.use) {
1774:           m    = a->compressedrow.nrows;
1775:           ii   = a->compressedrow.i;
1776:           ridx = a->compressedrow.rindex;
1777:         } else {
1778:           m    = A->rmap->n;
1779:           ii   = a->i;
1780:           ridx = NULL;
1781:         }
1782:         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1783:         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
1784:         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1785:         else nnz = a->nz;

1787:         /* create cusparse matrix */
1788:         cusparsestruct->nrows = m;
1789:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1790:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1791:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1792:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1794:         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1795:         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1796:         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1797:         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1798:         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1799:         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1800:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

1802:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1803:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1804:           /* set the matrix */
1805:           CsrMatrix *mat= new CsrMatrix;
1806:           mat->num_rows = m;
1807:           mat->num_cols = A->cmap->n;
1808:           mat->num_entries = nnz;
1809:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1810:           mat->row_offsets->assign(ii, ii + m+1);

1812:           mat->column_indices = new THRUSTINTARRAY32(nnz);
1813:           mat->column_indices->assign(a->j, a->j+nnz);

1815:           mat->values = new THRUSTARRAY(nnz);
1816:           if (a->a) mat->values->assign(a->a, a->a+nnz);

1818:           /* assign the pointer */
1819:           matstruct->mat = mat;
1820:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1821:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1822:             stat = cusparseCreateCsr(&matstruct->matDescr,
1823:                                     mat->num_rows, mat->num_cols, mat->num_entries,
1824:                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1825:                                     mat->values->data().get(),
1826:                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1827:                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1828:           }
1829:          #endif
1830:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1831:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1832:           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1833:          #else
1834:           CsrMatrix *mat= new CsrMatrix;
1835:           mat->num_rows = m;
1836:           mat->num_cols = A->cmap->n;
1837:           mat->num_entries = nnz;
1838:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1839:           mat->row_offsets->assign(ii, ii + m+1);

1841:           mat->column_indices = new THRUSTINTARRAY32(nnz);
1842:           mat->column_indices->assign(a->j, a->j+nnz);

1844:           mat->values = new THRUSTARRAY(nnz);
1845:           if (a->a) mat->values->assign(a->a, a->a+nnz);

1847:           cusparseHybMat_t hybMat;
1848:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1849:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1850:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1851:           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1852:               matstruct->descr, mat->values->data().get(),
1853:               mat->row_offsets->data().get(),
1854:               mat->column_indices->data().get(),
1855:               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1856:           /* assign the pointer */
1857:           matstruct->mat = hybMat;

1859:           if (mat) {
1860:             if (mat->values) delete (THRUSTARRAY*)mat->values;
1861:             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1862:             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1863:             delete (CsrMatrix*)mat;
1864:           }
1865:          #endif
1866:         }

1868:         /* assign the compressed row indices */
1869:         if (a->compressedrow.use) {
1870:           cusparsestruct->workVector = new THRUSTARRAY(m);
1871:           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1872:           matstruct->cprowIndices->assign(ridx,ridx+m);
1873:           tmp = m;
1874:         } else {
1875:           cusparsestruct->workVector = NULL;
1876:           matstruct->cprowIndices    = NULL;
1877:           tmp = 0;
1878:         }
1879:         PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));

1881:         /* assign the pointer */
1882:         cusparsestruct->mat = matstruct;
1883:       } catch(char *ex) {
1884:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1885:       }
1886:       err  = WaitForCUDA();CHKERRCUDA(err);
1887:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1888:       cusparsestruct->nonzerostate = A->nonzerostate;
1889:     }
1890:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1891:   }
1892:   return(0);
1893: }
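
/* Note the two update paths above: when the nonzero pattern is unchanged
   (A->nonzerostate == cusparsestruct->nonzerostate) and the format is CSR, only the
   values array is re-uploaded and the cached transpose is invalidated; otherwise the
   whole Mat_SeqAIJCUSPARSEMultStruct is destroyed and rebuilt, including row offsets,
   column indices and the optional compressed-row index array. */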

1895: struct VecCUDAPlusEquals
1896: {
1897:   template <typename Tuple>
1898:   __host__ __device__
1899:   void operator()(Tuple t)
1900:   {
1901:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1902:   }
1903: };

1905: struct VecCUDAEquals
1906: {
1907:   template <typename Tuple>
1908:   __host__ __device__
1909:   void operator()(Tuple t)
1910:   {
1911:     thrust::get<1>(t) = thrust::get<0>(t);
1912:   }
1913: };

1915: struct VecCUDAEqualsReverse
1916: {
1917:   template <typename Tuple>
1918:   __host__ __device__
1919:   void operator()(Tuple t)
1920:   {
1921:     thrust::get<0>(t) = thrust::get<1>(t);
1922:   }
1923: };
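
/* These functors are applied elementwise through zip iterators. A minimal sketch of
   the pattern (illustrative only, not a call made at this point in the file):

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(src.begin(),dst.begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(src.end(),dst.end())),
                      VecCUDAPlusEquals());  // dst[i] += src[i]

   VecCUDAEqualsReverse writes through get<0> instead of get<1>, which matters when the
   first range is a permutation iterator, turning the copy into a scatter. */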

1925: struct MatMatCusparse {
1926:   PetscBool             cisdense;
1927:   PetscScalar           *Bt;
1928:   Mat                   X;
1929:   PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1930:   PetscLogDouble        flops;
1931:   CsrMatrix             *Bcsr;
1932: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1933:   cusparseSpMatDescr_t  matSpBDescr;
1934:   PetscBool             initialized;   /* whether the descriptors below for C = alpha op(A) op(B) + beta C have been created */
1935:   cusparseDnMatDescr_t  matBDescr;
1936:   cusparseDnMatDescr_t  matCDescr;
1937:   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1938:   size_t                mmBufferSize;
1939:   void                  *mmBuffer;
1940:   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1941:   cusparseSpGEMMDescr_t spgemmDesc;
1942: #endif
1943: };

1945: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1946: {
1947:   PetscErrorCode   ierr;
1948:   MatMatCusparse   *mmdata = (MatMatCusparse *)data;
1949:   cudaError_t      cerr;
1950:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1951:   cusparseStatus_t stat;
1952:  #endif

1955:   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1956:   delete mmdata->Bcsr;
1957:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1958:   if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
1959:   if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
1960:   if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
1961:   if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
1962:   if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
1963:   if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
1964:  #endif
1965:   MatDestroy(&mmdata->X);
1966:   PetscFree(data);
1967:   return(0);
1968: }

1970: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

1972: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1973: {
1974:   Mat_Product                  *product = C->product;
1975:   Mat                          A,B;
1976:   PetscInt                     m,n,blda,clda;
1977:   PetscBool                    flg,biscuda;
1978:   Mat_SeqAIJCUSPARSE           *cusp;
1979:   cusparseStatus_t             stat;
1980:   cusparseOperation_t          opA;
1981:   const PetscScalar            *barray;
1982:   PetscScalar                  *carray;
1983:   PetscErrorCode               ierr;
1984:   MatMatCusparse               *mmdata;
1985:   Mat_SeqAIJCUSPARSEMultStruct *mat;
1986:   CsrMatrix                    *csrmat;
1987:   cudaError_t                  cerr;

1990:   MatCheckProduct(C,1);
1991:   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1992:   mmdata = (MatMatCusparse*)product->data;
1993:   A    = product->A;
1994:   B    = product->B;
1995:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1996:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1997:   /* currently CopyToGPU does not copy if the matrix is bound to the CPU;
1998:      instead of silently accepting a wrong answer, I prefer to raise an error */
1999:   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2000:   MatSeqAIJCUSPARSECopyToGPU(A);
2001:   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2002:   switch (product->type) {
2003:   case MATPRODUCT_AB:
2004:   case MATPRODUCT_PtAP:
2005:     mat = cusp->mat;
2006:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2007:     m   = A->rmap->n;
2008:     n   = B->cmap->n;
2009:     break;
2010:   case MATPRODUCT_AtB:
2011:     if (!A->form_explicit_transpose) {
2012:       mat = cusp->mat;
2013:       opA = CUSPARSE_OPERATION_TRANSPOSE;
2014:     } else {
2015:       MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
2016:       mat  = cusp->matTranspose;
2017:       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2018:     }
2019:     m = A->cmap->n;
2020:     n = B->cmap->n;
2021:     break;
2022:   case MATPRODUCT_ABt:
2023:   case MATPRODUCT_RARt:
2024:     mat = cusp->mat;
2025:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2026:     m   = A->rmap->n;
2027:     n   = B->rmap->n;
2028:     break;
2029:   default:
2030:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2031:   }
2032:   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2033:   csrmat = (CsrMatrix*)mat->mat;
2034:   /* if the user passed a CPU matrix, copy the data to the GPU */
2035:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
2036:   if (!biscuda) {MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);}
2037:   MatDenseCUDAGetArrayRead(B,&barray);

2039:   MatDenseGetLDA(B,&blda);
2040:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2041:     MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
2042:     MatDenseGetLDA(mmdata->X,&clda);
2043:   } else {
2044:     MatDenseCUDAGetArrayWrite(C,&carray);
2045:     MatDenseGetLDA(C,&clda);
2046:   }

2048:   PetscLogGpuTimeBegin();
2049:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2050:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2051:   /* (re)allocate mmBuffer if not initialized or the LDAs have changed */
2052:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2053:     size_t mmBufferSize;
2054:     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2055:     if (!mmdata->matBDescr) {
2056:       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2057:       mmdata->Blda = blda;
2058:     }

2060:     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2061:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2062:       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2063:       mmdata->Clda = clda;
2064:     }

2066:     if (!mat->matDescr) {
2067:       stat = cusparseCreateCsr(&mat->matDescr,
2068:                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2069:                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2070:                                csrmat->values->data().get(),
2071:                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2072:                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2073:     }
2074:     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2075:                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2076:                                    mmdata->matCDescr,cusparse_scalartype,
2077:                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2078:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2079:       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2080:       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2081:       mmdata->mmBufferSize = mmBufferSize;
2082:     }
2083:     mmdata->initialized = PETSC_TRUE;
2084:   } else {
2085:     /* to be safe, always update pointers of the mats */
2086:     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2087:     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2088:     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2089:   }

2091:   /* do cusparseSpMM, which supports transpose on B */
2092:   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2093:                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2094:                       mmdata->matCDescr,cusparse_scalartype,
2095:                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2096:  #else
2097:   PetscInt k;
2098:   /* cusparseXcsrmm does not support transpose on B */
2099:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2100:     cublasHandle_t cublasv2handle;
2101:     cublasStatus_t cerr;

2103:     PetscCUBLASGetHandle(&cublasv2handle);
2104:     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2105:                        B->cmap->n,B->rmap->n,
2106:                        &PETSC_CUSPARSE_ONE ,barray,blda,
2107:                        &PETSC_CUSPARSE_ZERO,barray,blda,
2108:                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2109:     blda = B->cmap->n;
2110:     k    = B->cmap->n;
2111:   } else {
2112:     k    = B->rmap->n;
2113:   }

2115:   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2116:   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2117:                            csrmat->num_entries,mat->alpha_one,mat->descr,
2118:                            csrmat->values->data().get(),
2119:                            csrmat->row_offsets->data().get(),
2120:                            csrmat->column_indices->data().get(),
2121:                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2122:                            carray,clda);CHKERRCUSPARSE(stat);
2123:  #endif
2124:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2125:   PetscLogGpuTimeEnd();
2126:   PetscLogGpuFlops(n*2.0*csrmat->num_entries);
2127:   MatDenseCUDARestoreArrayRead(B,&barray);
2128:   if (product->type == MATPRODUCT_RARt) {
2129:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2130:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
2131:   } else if (product->type == MATPRODUCT_PtAP) {
2132:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2133:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
2134:   } else {
2135:     MatDenseCUDARestoreArrayWrite(C,&carray);
2136:   }
2137:   if (mmdata->cisdense) {
2138:     MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
2139:   }
2140:   if (!biscuda) {
2141:     MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
2142:   }
2143:   return(0);
2144: }
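
/* Note how the two dense two-sided products are assembled above: for MATPRODUCT_PtAP
   the SpMM computes X = A*P and the dense-dense kernel then forms C = P^T*X, while for
   MATPRODUCT_RARt the SpMM computes X = A*R^T (opB = transpose on CUDA >= 11, an
   explicit cublasXgeam transpose of R before then) and C = R*X follows. */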

2146: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2147: {
2148:   Mat_Product        *product = C->product;
2149:   Mat                A,B;
2150:   PetscInt           m,n;
2151:   PetscBool          cisdense,flg;
2152:   PetscErrorCode     ierr;
2153:   MatMatCusparse     *mmdata;
2154:   Mat_SeqAIJCUSPARSE *cusp;

2157:   MatCheckProduct(C,1);
2158:   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2159:   A    = product->A;
2160:   B    = product->B;
2161:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2162:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2163:   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2164:   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2165:   switch (product->type) {
2166:   case MATPRODUCT_AB:
2167:     m = A->rmap->n;
2168:     n = B->cmap->n;
2169:     break;
2170:   case MATPRODUCT_AtB:
2171:     m = A->cmap->n;
2172:     n = B->cmap->n;
2173:     break;
2174:   case MATPRODUCT_ABt:
2175:     m = A->rmap->n;
2176:     n = B->rmap->n;
2177:     break;
2178:   case MATPRODUCT_PtAP:
2179:     m = B->cmap->n;
2180:     n = B->cmap->n;
2181:     break;
2182:   case MATPRODUCT_RARt:
2183:     m = B->rmap->n;
2184:     n = B->rmap->n;
2185:     break;
2186:   default:
2187:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2188:   }
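  /* For example, for MATPRODUCT_AtB the result C = A^T*B is (A->cmap->n) x (B->cmap->n), while for
     MATPRODUCT_PtAP and MATPRODUCT_RARt the sizes refer to the final projected matrix, which is
     assembled below through the intermediate dense product mmdata->X */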
2189:   MatSetSizes(C,m,n,m,n);
2190:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2191:   PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
2192:   MatSetType(C,MATSEQDENSECUDA);

2194:   /* product data */
2195:   PetscNew(&mmdata);
2196:   mmdata->cisdense = cisdense;
2197:  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2198:   /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2199:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2200:     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2201:   }
2202:  #endif
2203:   /* for these products we need intermediate storage */
2204:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2205:     MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
2206:     MatSetType(mmdata->X,MATSEQDENSECUDA);
2207:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2208:       MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
2209:     } else {
2210:       MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
2211:     }
2212:   }
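  /* For example, for MATPRODUCT_PtAP the numeric phase first computes X = A*B on the GPU and then
     C = B^T*X with a dense GEMM (see MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA above) */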
2213:   C->product->data    = mmdata;
2214:   C->product->destroy = MatDestroy_MatMatCusparse;

2216:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2217:   return(0);
2218: }

2220: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2221: {
2222:   Mat_Product                  *product = C->product;
2223:   Mat                          A,B;
2224:   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2225:   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2226:   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2227:   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2228:   PetscBool                    flg;
2229:   PetscErrorCode               ierr;
2230:   cusparseStatus_t             stat;
2231:   cudaError_t                  cerr;
2232:   MatProductType               ptype;
2233:   MatMatCusparse               *mmdata;
2234: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2235:   cusparseSpMatDescr_t         BmatSpDescr;
2236: #endif

2239:   MatCheckProduct(C,1);
2240:   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
2241:   PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);
2242:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
2243:   mmdata = (MatMatCusparse*)C->product->data;
2244:   A = product->A;
2245:   B = product->B;
2246:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2247:     mmdata->reusesym = PETSC_FALSE;
2248:     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2249:     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2250:     Cmat = Ccusp->mat;
2251:     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2252:     Ccsr = (CsrMatrix*)Cmat->mat;
2253:     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2254:     goto finalize;
2255:   }
2256:   if (!c->nz) goto finalize;
2257:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2258:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2259:   PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2260:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2261:   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2262:   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2263:   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2264:   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2265:   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2266:   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2267:   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2268:   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2269:   MatSeqAIJCUSPARSECopyToGPU(A);
2270:   MatSeqAIJCUSPARSECopyToGPU(B);

2272:   ptype = product->type;
2273:   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2274:   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2275:   switch (ptype) {
2276:   case MATPRODUCT_AB:
2277:     Amat = Acusp->mat;
2278:     Bmat = Bcusp->mat;
2279:     break;
2280:   case MATPRODUCT_AtB:
2281:     Amat = Acusp->matTranspose;
2282:     Bmat = Bcusp->mat;
2283:     break;
2284:   case MATPRODUCT_ABt:
2285:     Amat = Acusp->mat;
2286:     Bmat = Bcusp->matTranspose;
2287:     break;
2288:   default:
2289:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2290:   }
2291:   Cmat = Ccusp->mat;
2292:   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2293:   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2294:   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2295:   Acsr = (CsrMatrix*)Amat->mat;
2296:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2297:   Ccsr = (CsrMatrix*)Cmat->mat;
2298:   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2299:   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2300:   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2301:   PetscLogGpuTimeBegin();
2302: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2303:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2304:   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2305:                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2306:                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2307:                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2308:   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2309:                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2310:                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2311: #else
2312:   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2313:                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2314:                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2315:                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2316:                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2317: #endif
2318:   PetscLogGpuFlops(mmdata->flops);
2319:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2320:   PetscLogGpuTimeEnd();
2321:   C->offloadmask = PETSC_OFFLOAD_GPU;
2322: finalize:
2323:   /* shorter version of MatAssemblyEnd_SeqAIJ */
2324:   PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);
2325:   PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");
2326:   PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);
2327:   c->reallocs         = 0;
2328:   C->info.mallocs    += 0;
2329:   C->info.nz_unneeded = 0;
2330:   C->assembled = C->was_assembled = PETSC_TRUE;
2331:   C->num_ass++;
2332:   return(0);
2333: }

2335: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2336: {
2337:   Mat_Product                  *product = C->product;
2338:   Mat                          A,B;
2339:   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2340:   Mat_SeqAIJ                   *a,*b,*c;
2341:   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2342:   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2343:   PetscInt                     i,j,m,n,k;
2344:   PetscBool                    flg;
2345:   PetscErrorCode               ierr;
2346:   cusparseStatus_t             stat;
2347:   cudaError_t                  cerr;
2348:   MatProductType               ptype;
2349:   MatMatCusparse               *mmdata;
2350:   PetscLogDouble               flops;
2351:   PetscBool                    biscompressed,ciscompressed;
2352: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2353:   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2354:   size_t                       bufSize2;
2355:   cusparseSpMatDescr_t         BmatSpDescr;
2356: #else
2357:   int                          cnz;
2358: #endif

2361:   MatCheckProduct(C,1);
2362:   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2363:   A    = product->A;
2364:   B    = product->B;
2365:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2366:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2367:   PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2368:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2369:   a = (Mat_SeqAIJ*)A->data;
2370:   b = (Mat_SeqAIJ*)B->data;
2371:   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2372:   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2373:   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2374:   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

2376:   /* product data */
2377:   PetscNew(&mmdata);
2378:   C->product->data    = mmdata;
2379:   C->product->destroy = MatDestroy_MatMatCusparse;

2381:   MatSeqAIJCUSPARSECopyToGPU(A);
2382:   MatSeqAIJCUSPARSECopyToGPU(B);
2383:   ptype = product->type;
2384:   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2385:   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2386:   biscompressed = PETSC_FALSE;
2387:   ciscompressed = PETSC_FALSE;
2388:   switch (ptype) {
2389:   case MATPRODUCT_AB:
2390:     m = A->rmap->n;
2391:     n = B->cmap->n;
2392:     k = A->cmap->n;
2393:     Amat = Acusp->mat;
2394:     Bmat = Bcusp->mat;
2395:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2396:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2397:     break;
2398:   case MATPRODUCT_AtB:
2399:     m = A->cmap->n;
2400:     n = B->cmap->n;
2401:     k = A->rmap->n;
2402:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
2403:     Amat = Acusp->matTranspose;
2404:     Bmat = Bcusp->mat;
2405:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2406:     break;
2407:   case MATPRODUCT_ABt:
2408:     m = A->rmap->n;
2409:     n = B->rmap->n;
2410:     k = A->cmap->n;
2411:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);
2412:     Amat = Acusp->mat;
2413:     Bmat = Bcusp->matTranspose;
2414:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2415:     break;
2416:   default:
2417:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2418:   }

2420:   /* create cusparse matrix */
2421:   MatSetSizes(C,m,n,m,n);
2422:   MatSetType(C,MATSEQAIJCUSPARSE);
2423:   c     = (Mat_SeqAIJ*)C->data;
2424:   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2425:   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2426:   Ccsr  = new CsrMatrix;

2428:   c->compressedrow.use = ciscompressed;
2429:   if (c->compressedrow.use) { /* if A is in compressed row format, then C will be too */
2430:     c->compressedrow.nrows = a->compressedrow.nrows;
2431:     PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);
2432:     PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);
2433:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2434:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2435:     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2436:   } else {
2437:     c->compressedrow.nrows  = 0;
2438:     c->compressedrow.i      = NULL;
2439:     c->compressedrow.rindex = NULL;
2440:     Ccusp->workVector       = NULL;
2441:     Cmat->cprowIndices      = NULL;
2442:   }
2443:   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2444:   Ccusp->mat      = Cmat;
2445:   Ccusp->mat->mat = Ccsr;
2446:   Ccsr->num_rows    = Ccusp->nrows;
2447:   Ccsr->num_cols    = n;
2448:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2449:   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2450:   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2451:   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2452:   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2453:   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2454:   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2455:   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2456:   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2457:   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2458:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cuSPARSE raises errors in various calls when matrices have zero rows/columns! */
2459:     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2460:     c->nz = 0;
2461:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2462:     Ccsr->values = new THRUSTARRAY(c->nz);
2463:     goto finalizesym;
2464:   }

2466:   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2467:   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2468:   Acsr = (CsrMatrix*)Amat->mat;
2469:   if (!biscompressed) {
2470:     Bcsr = (CsrMatrix*)Bmat->mat;
2471: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2472:     BmatSpDescr = Bmat->matDescr;
2473: #endif
2474:   } else { /* we need to use row offsets for the full matrix */
2475:     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2476:     Bcsr = new CsrMatrix;
2477:     Bcsr->num_rows       = B->rmap->n;
2478:     Bcsr->num_cols       = cBcsr->num_cols;
2479:     Bcsr->num_entries    = cBcsr->num_entries;
2480:     Bcsr->column_indices = cBcsr->column_indices;
2481:     Bcsr->values         = cBcsr->values;
2482:     if (!Bcusp->rowoffsets_gpu) {
2483:       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2484:       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2485:       PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
2486:     }
2487:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2488:     mmdata->Bcsr = Bcsr;
2489: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2490:     if (Bcsr->num_rows && Bcsr->num_cols) {
2491:       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2492:                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2493:                                Bcsr->values->data().get(),
2494:                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2495:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2496:     }
2497:     BmatSpDescr = mmdata->matSpBDescr;
2498: #endif
2499:   }
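  /* Note that when B is compressed, the Bcsr assembled above aliases B's column indices and values;
     only the row offsets are swapped for the full-length array Bcusp->rowoffsets_gpu, so no matrix
     entries are copied */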
2500:   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2501:   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2502:   /* precompute the flop count */
2503:   if (ptype == MATPRODUCT_AB) {
2504:     for (i=0, flops = 0; i<A->rmap->n; i++) {
2505:       const PetscInt st = a->i[i];
2506:       const PetscInt en = a->i[i+1];
2507:       for (j=st; j<en; j++) {
2508:         const PetscInt brow = a->j[j];
2509:         flops += 2.*(b->i[brow+1] - b->i[brow]);
2510:       }
2511:     }
2512:   } else if (ptype == MATPRODUCT_AtB) {
2513:     for (i=0, flops = 0; i<A->rmap->n; i++) {
2514:       const PetscInt anzi = a->i[i+1] - a->i[i];
2515:       const PetscInt bnzi = b->i[i+1] - b->i[i];
2516:       flops += (2.*anzi)*bnzi;
2517:     }
2518:   } else { /* TODO */
2519:     flops = 0.;
2520:   }
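  /* To see where the AB count comes from: each nonzero a(i,j) of A multiplies the whole row j of B,
     costing one multiply and one add per entry of that row, i.e. 2*(b->i[j+1]-b->i[j]) flops */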

2522:   mmdata->flops = flops;
2523:   PetscLogGpuTimeBegin();
2524: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2525:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2526:   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2527:                            NULL, NULL, NULL,
2528:                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2529:                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2530:   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2531:   /* query how many bytes of external memory are needed for the work estimation */
2532:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2533:                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2534:                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2535:                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2536:   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2537:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2538:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2539:                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2540:                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2541:                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2542:   /* query the buffer size again, this time for the compute step */
2543:   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2544:                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2545:                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2546:                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2547:   /* Neither the cuSPARSE documentation nor the API is clear here.
2548:      We need both buffers to perform the operations properly!
2549:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2550:      it only appears in the workEstimation calls, yet it seems to be needed by compute as well,
2551:      so presumably its address is stored in the descriptor. What a messy API... */
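  /* In summary, the SpGEMM call sequence used here appears to be:
       cusparseSpGEMM_workEstimation(...,&bufSize2,NULL);          (query size of mmBuffer2)
       cudaMalloc(&mmBuffer2,bufSize2);
       cusparseSpGEMM_workEstimation(...,&bufSize2,mmBuffer2);     (inspect A and B)
       cusparseSpGEMM_compute(...,&mmBufferSize,NULL);             (query size of mmBuffer)
       cudaMalloc(&mmBuffer,mmBufferSize);
       cusparseSpGEMM_compute(...,&mmBufferSize,mmBuffer);         (numeric A*B)
       cusparseSpMatGetSize(...);                                  (retrieve nnz(C))
       allocate C's arrays and call cusparseCsrSetPointers(...);
       cusparseSpGEMM_copy(...);                                   (fill C's CSR arrays)
  */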
2552:   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2553:   /* compute the intermediate product of A * B */
2554:   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2555:                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2556:                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2557:                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2558:   /* get matrix C non-zero entries C_nnz1 */
2559:   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2560:   c->nz = (PetscInt) C_nnz1;
2561:   PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);
2562:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2563:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2564:   Ccsr->values = new THRUSTARRAY(c->nz);
2565:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2566:   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2567:                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2568:   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2569:                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2570:                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2571: #else
2572:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2573:   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2574:                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2575:                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2576:                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2577:                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2578:   c->nz = cnz;
2579:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2580:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2581:   Ccsr->values = new THRUSTARRAY(c->nz);
2582:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

2584:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2585:   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2586:      I have tried the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values,
2587:      but it seems quite buggy when D is NULL, despite the fact that the cuSPARSE documentation claims this is supported! */
2588:   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2589:                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2590:                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2591:                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2592:                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2593: #endif
2594:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2595:   PetscLogGpuFlops(mmdata->flops);
2596:   PetscLogGpuTimeEnd();
2597: finalizesym:
2598:   c->singlemalloc = PETSC_FALSE;
2599:   c->free_a       = PETSC_TRUE;
2600:   c->free_ij      = PETSC_TRUE;
2601:   PetscMalloc1(m+1,&c->i);
2602:   PetscMalloc1(c->nz,&c->j);
2603:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2604:     PetscInt *d_i = c->i;
2605:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2606:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2607:     ii   = *Ccsr->row_offsets;
2608:     jj   = *Ccsr->column_indices;
2609:     if (ciscompressed) d_i = c->compressedrow.i;
2610:     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2611:     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2612:   } else {
2613:     PetscInt *d_i = c->i;
2614:     if (ciscompressed) d_i = c->compressedrow.i;
2615:     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2616:     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2617:   }
2618:   if (ciscompressed) { /* need to expand host row offsets */
2619:     PetscInt r = 0;
2620:     c->i[0] = 0;
2621:     for (k = 0; k < c->compressedrow.nrows; k++) {
2622:       const PetscInt next = c->compressedrow.rindex[k];
2623:       const PetscInt old = c->compressedrow.i[k];
2624:       for (; r < next; r++) c->i[r+1] = old;
2625:     }
2626:     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2627:   }
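  /* A small example of the expansion above: with m = 4, nonzero rows rindex = [1,3] and compressed
     offsets i = [0,2,5], the loop produces the full offsets c->i = [0,0,2,2,5], i.e. each empty row
     simply repeats the previous offset */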
2628:   PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
2629:   PetscMalloc1(m,&c->ilen);
2630:   PetscMalloc1(m,&c->imax);
2631:   c->maxnz = c->nz;
2632:   c->nonzerorowcnt = 0;
2633:   c->rmax = 0;
2634:   for (k = 0; k < m; k++) {
2635:     const PetscInt nn = c->i[k+1] - c->i[k];
2636:     c->ilen[k] = c->imax[k] = nn;
2637:     c->nonzerorowcnt += (PetscInt)!!nn;
2638:     c->rmax = PetscMax(c->rmax,nn);
2639:   }
2640:   MatMarkDiagonal_SeqAIJ(C);
2641:   PetscMalloc1(c->nz,&c->a);
2642:   Ccsr->num_entries = c->nz;

2644:   C->nonzerostate++;
2645:   PetscLayoutSetUp(C->rmap);
2646:   PetscLayoutSetUp(C->cmap);
2647:   Ccusp->nonzerostate = C->nonzerostate;
2648:   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2649:   C->preallocated  = PETSC_TRUE;
2650:   C->assembled     = PETSC_FALSE;
2651:   C->was_assembled = PETSC_FALSE;
2652:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2653:     mmdata->reusesym = PETSC_TRUE;
2654:     C->offloadmask   = PETSC_OFFLOAD_GPU;
2655:   }
2656:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2657:   return(0);
2658: }

2660: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

2662: /* handles sparse or dense B */
2663: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2664: {
2665:   Mat_Product    *product = mat->product;
2667:   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

2670:   MatCheckProduct(mat,1);
2671:   PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);
2672:   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2673:     PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);
2674:   }
2675:   if (product->type == MATPRODUCT_ABC) {
2676:     Ciscusp = PETSC_FALSE;
2677:     if (!product->C->boundtocpu) {
2678:       PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);
2679:     }
2680:   }
2681:   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2682:     PetscBool usecpu = PETSC_FALSE;
2683:     switch (product->type) {
2684:     case MATPRODUCT_AB:
2685:       if (product->api_user) {
2686:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
2687:         PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2688:         PetscOptionsEnd();
2689:       } else {
2690:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
2691:         PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2692:         PetscOptionsEnd();
2693:       }
2694:       break;
2695:     case MATPRODUCT_AtB:
2696:       if (product->api_user) {
2697:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
2698:         PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2699:         PetscOptionsEnd();
2700:       } else {
2701:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
2702:         PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2703:         PetscOptionsEnd();
2704:       }
2705:       break;
2706:     case MATPRODUCT_PtAP:
2707:       if (product->api_user) {
2708:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
2709:         PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2710:         PetscOptionsEnd();
2711:       } else {
2712:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
2713:         PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2714:         PetscOptionsEnd();
2715:       }
2716:       break;
2717:     case MATPRODUCT_RARt:
2718:       if (product->api_user) {
2719:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
2720:         PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2721:         PetscOptionsEnd();
2722:       } else {
2723:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
2724:         PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2725:         PetscOptionsEnd();
2726:       }
2727:       break;
2728:     case MATPRODUCT_ABC:
2729:       if (product->api_user) {
2730:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
2731:         PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2732:         PetscOptionsEnd();
2733:       } else {
2734:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
2735:         PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2736:         PetscOptionsEnd();
2737:       }
2738:       break;
2739:     default:
2740:       break;
2741:     }
2742:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2743:   }
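  /* For instance, with the user-level API the CPU backend for C = A*B can be selected at run time with
     the (hypothetical) invocation ./myapp -matmatmult_backend_cpu, whose MatProduct counterpart is
     -matproduct_ab_backend_cpu */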
2744:   /* dispatch */
2745:   if (isdense) {
2746:     switch (product->type) {
2747:     case MATPRODUCT_AB:
2748:     case MATPRODUCT_AtB:
2749:     case MATPRODUCT_ABt:
2750:     case MATPRODUCT_PtAP:
2751:     case MATPRODUCT_RARt:
2752:       if (product->A->boundtocpu) {
2753:         MatProductSetFromOptions_SeqAIJ_SeqDense(mat);
2754:       } else {
2755:         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2756:       }
2757:       break;
2758:     case MATPRODUCT_ABC:
2759:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2760:       break;
2761:     default:
2762:       break;
2763:     }
2764:   } else if (Biscusp && Ciscusp) {
2765:     switch (product->type) {
2766:     case MATPRODUCT_AB:
2767:     case MATPRODUCT_AtB:
2768:     case MATPRODUCT_ABt:
2769:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2770:       break;
2771:     case MATPRODUCT_PtAP:
2772:     case MATPRODUCT_RARt:
2773:     case MATPRODUCT_ABC:
2774:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2775:       break;
2776:     default:
2777:       break;
2778:     }
2779:   } else { /* fallback for AIJ */
2780:     MatProductSetFromOptions_SeqAIJ(mat);
2781:   }
2782:   return(0);
2783: }

2785: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2786: {

2790:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2791:   return(0);
2792: }

2794: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2795: {

2799:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2800:   return(0);
2801: }

2803: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2804: {

2808:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2809:   return(0);
2810: }

2812: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2813: {

2817:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2818:   return(0);
2819: }

2821: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2822: {

2826:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2827:   return(0);
2828: }

2830: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2831: {
2832:   int i = blockIdx.x*blockDim.x + threadIdx.x;
2833:   if (i < n) y[idx[i]] += x[i];
2834: }
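/* The kernel above is launched with one thread per compressed row, e.g.
     ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,idx,work,z);
   so that each thread i < n performs z[idx[i]] += work[i]; see its use in MatMultAddKernel_SeqAIJCUSPARSE below */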

2836: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2837: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2838: {
2839:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2840:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2841:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2842:   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2843:   PetscErrorCode               ierr;
2844:   cudaError_t                  cerr;
2845:   cusparseStatus_t             stat;
2846:   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2847:   PetscBool                    compressed;
2848: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2849:   PetscInt                     nx,ny;
2850: #endif

2853:   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian without transpose is not supported");
2854:   if (!a->nonzerorowcnt) {
2855:     if (!yy) {VecSet_SeqCUDA(zz,0);}
2856:     else {VecCopy_SeqCUDA(yy,zz);}
2857:     return(0);
2858:   }
2859:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2860:   MatSeqAIJCUSPARSECopyToGPU(A);
2861:   if (!trans) {
2862:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2863:     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2864:   } else {
2865:     if (herm || !A->form_explicit_transpose) {
2866:       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2867:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2868:     } else {
2869:       if (!cusparsestruct->matTranspose) {MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);}
2870:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2871:     }
2872:   }
2873:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2874:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

2876:   try {
2877:     VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
2878:     if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so we need an up-to-date zarray on the GPU */
2879:     else {VecCUDAGetArrayWrite(zz,&zarray);} /* write-only zz, so no need to initialize zarray on the GPU */

2881:     PetscLogGpuTimeBegin();
2882:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2883:       /* z = A x + beta y.
2884:          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2885:          When A is not compressed and z = y, we can set beta=1 to compute y = Ax + y in one call.
2886:       */
2887:       xptr = xarray;
2888:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2889:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2890:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2891:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2892:           allocated to accommodate different uses. So we get the length info directly from mat.
2893:        */
2894:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2895:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2896:         nx = mat->num_cols;
2897:         ny = mat->num_rows;
2898:       }
2899:      #endif
2900:     } else {
2901:       /* z = A^T x + beta y
2902:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2903:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2904:        */
2905:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2906:       dptr = zarray;
2907:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2908:       if (compressed) { /* Scatter x to work vector */
2909:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2910:         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2911:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2912:                          VecCUDAEqualsReverse());
2913:       }
2914:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2915:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2916:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2917:         nx = mat->num_rows;
2918:         ny = mat->num_cols;
2919:       }
2920:      #endif
2921:     }

2923:     /* csr_spmv does y = alpha op(A) x + beta y */
2924:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2925:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2926:       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2927:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2928:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2929:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2930:         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2931:                                 matstruct->matDescr,
2932:                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2933:                                 matstruct->cuSpMV[opA].vecYDescr,
2934:                                 cusparse_scalartype,
2935:                                 cusparsestruct->spmvAlg,
2936:                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2937:         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

2939:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2940:       } else {
2941:         /* the value pointers of x and y might change between calls, but their shapes are unchanged, so we just update the pointers */
2942:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2943:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2944:       }

2946:       stat = cusparseSpMV(cusparsestruct->handle, opA,
2947:                                matstruct->alpha_one,
2948:                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2949:                                matstruct->cuSpMV[opA].vecXDescr,
2950:                                beta,
2951:                                matstruct->cuSpMV[opA].vecYDescr,
2952:                                cusparse_scalartype,
2953:                                cusparsestruct->spmvAlg,
2954:                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2955:      #else
2956:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2957:       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2958:                                mat->num_rows, mat->num_cols,
2959:                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2960:                                mat->values->data().get(), mat->row_offsets->data().get(),
2961:                                mat->column_indices->data().get(), xptr, beta,
2962:                                dptr);CHKERRCUSPARSE(stat);
2963:      #endif
2964:     } else {
2965:       if (cusparsestruct->nrows) {
2966:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2967:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2968:        #else
2969:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2970:         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2971:                                  matstruct->alpha_one, matstruct->descr, hybMat,
2972:                                  xptr, beta,
2973:                                  dptr);CHKERRCUSPARSE(stat);
2974:        #endif
2975:       }
2976:     }
2977:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2978:     PetscLogGpuTimeEnd();

2980:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2981:       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2982:         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2983:           VecCopy_SeqCUDA(yy,zz); /* zz = yy */
2984:         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2985:           VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2986:         }
2987:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2988:         VecSet_SeqCUDA(zz,0);
2989:       }

2991:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2992:       if (compressed) {
2993:         PetscLogGpuTimeBegin();
2994:         /* I wanted to make this for_each asynchronous, but failed. thrust::async::for_each() returns an event (internally registered),
2995:            and in the destructor of the scope it will call cudaStreamSynchronize() on this stream. One would have to store all the events to
2996:            prevent that, so I just added a ScatterAdd kernel instead.
2997:          */
2998:        #if 0
2999:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3000:         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3001:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3002:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3003:                          VecCUDAPlusEquals());
3004:        #else
3005:         PetscInt n = matstruct->cprowIndices->size();
3006:         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
3007:        #endif
3008:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
3009:         PetscLogGpuTimeEnd();
3010:       }
3011:     } else {
3012:       if (yy && yy != zz) {
3013:         VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
3014:       }
3015:     }
3016:     VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
3017:     if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
3018:     else {VecCUDARestoreArrayWrite(zz,&zarray);}
3019:   } catch(char *ex) {
3020:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3021:   }
3022:   if (yy) {
3023:     PetscLogGpuFlops(2.0*a->nz);
3024:   } else {
3025:     PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
3026:   }
3027:   return(0);
3028: }

3030: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3031: {

3035:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
3036:   return(0);
3037: }

3039: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
3040: {
3041:   PetscErrorCode              ierr;
3042:   PetscSplitCSRDataStructure  *d_mat = NULL;
3044:   if (A->factortype == MAT_FACTOR_NONE) {
3045:     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
3046:   }
3047:   MatAssemblyEnd_SeqAIJ(A,mode); // this does very little if assembled on GPU - call it?
3048:   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) return(0);
3049:   if (d_mat) {
3050:     A->offloadmask = PETSC_OFFLOAD_GPU;
3051:   }

3053:   return(0);
3054: }

3056: /* --------------------------------------------------------------------------------*/
3057: /*@
3058:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3059:    (the default sequential PETSc format). This matrix will ultimately be pushed down
3060:    to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
3061:    assembly performance the user should preallocate the matrix storage by setting
3062:    the parameter nz (or the array nnz).  By setting these parameters accurately,
3063:    performance during matrix assembly can be increased by more than a factor of 50.

3065:    Collective

3067:    Input Parameters:
3068: +  comm - MPI communicator, set to PETSC_COMM_SELF
3069: .  m - number of rows
3070: .  n - number of columns
3071: .  nz - number of nonzeros per row (same for all rows)
3072: -  nnz - array containing the number of nonzeros in the various rows
3073:          (possibly different for each row) or NULL

3075:    Output Parameter:
3076: .  A - the matrix

3078:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3079:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3080:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

3082:    Notes:
3083:    If nnz is given then nz is ignored

3085:    The AIJ format (also called the Yale sparse matrix format or
3086:    compressed row storage) is fully compatible with standard Fortran 77
3087:    storage.  That is, the stored row and column indices can begin at
3088:    either one (as in Fortran) or zero.  See the users' manual for details.

3090:    Specify the preallocated storage with either nz or nnz (not both).
3091:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3092:    allocation.  For large problems you MUST preallocate memory or you
3093:    will get TERRIBLE performance, see the users' manual chapter on matrices.

3095:    By default, this format uses inodes (identical nodes) when possible, to
3096:    improve numerical efficiency of matrix-vector products and solves. We
3097:    search for consecutive rows with the same nonzero structure, thereby
3098:    reusing matrix information to achieve increased efficiency.
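
   Example usage (a minimal sketch of the recommended paradigm; error checking omitted):
.vb
   Mat A;
   MatCreate(PETSC_COMM_SELF,&A);
   MatSetSizes(A,m,n,m,n);
   MatSetType(A,MATSEQAIJCUSPARSE);
   MatSeqAIJSetPreallocation(A,nz,NULL);
   /* ... MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd() ... */
   MatDestroy(&A);
.ve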

3100:    Level: intermediate

3102: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3103: @*/
3104: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3105: {

3109:   MatCreate(comm,A);
3110:   MatSetSizes(*A,m,n,m,n);
3111:   MatSetType(*A,MATSEQAIJCUSPARSE);
3112:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
3113:   return(0);
3114: }

3116: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3117: {
3118:   PetscErrorCode              ierr;
3119:   PetscSplitCSRDataStructure  *d_mat = NULL;

3122:   if (A->factortype == MAT_FACTOR_NONE) {
3123:     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
3124:     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
3125:     MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
3126:   } else {
3127:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
3128:   }
3129:   if (d_mat) {
3130:     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
3131:     cudaError_t                err;
3132:     PetscSplitCSRDataStructure h_mat;
3133:     PetscInfo(A,"Have device matrix\n");
3134:     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
3135:     if (a->compressedrow.use) {
3136:       err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
3137:     }
3138:     err = cudaFree(d_mat);CHKERRCUDA(err);
3139:   }
3140:   PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3141:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
3142:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3143:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3144:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3145:   PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
3146:   PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3147:   PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3148:   MatDestroy_SeqAIJ(A);
3149:   return(0);
3150: }

3152: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3153: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
3154: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
3155: {

3159:   MatDuplicate_SeqAIJ(A,cpvalues,B);
3160:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
3161:   return(0);
3162: }

3164: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
3165: {
3166:   PetscErrorCode     ierr;
3167:   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3168:   Mat_SeqAIJCUSPARSE *cy;
3169:   Mat_SeqAIJCUSPARSE *cx;
3170:   PetscScalar        *ay;
3171:   const PetscScalar  *ax;
3172:   CsrMatrix          *csry,*csrx;
3173:   cudaError_t        cerr;

3176:   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3177:   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3178:   if (X->ops->axpy != Y->ops->axpy) {
3179:     MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3180:     MatAXPY_SeqAIJ(Y,a,X,str);
3181:     return(0);
3182:   }
3183:   /* if we are here, both matrices are of type SeqAIJCUSPARSE and neither is bound to the CPU */
3184:   MatSeqAIJCUSPARSECopyToGPU(Y);
3185:   MatSeqAIJCUSPARSECopyToGPU(X);
3186:   if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3187:   if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3188:   csry = (CsrMatrix*)cy->mat->mat;
3189:   csrx = (CsrMatrix*)cx->mat->mat;
3190:   /* see if we can turn this into a cublas axpy */
3191:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3192:     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3193:     if (eq) {
3194:       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3195:     }
3196:     if (eq) str = SAME_NONZERO_PATTERN;
3197:   }
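  /* When the two CSR patterns match, Y += a*X reduces to an axpy over the value arrays,
     ay[k] += a*ax[k] for k = 0,...,nz-1, handled by cublasXaxpy in the SAME_NONZERO_PATTERN
     branch below */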
3198:   /* spgeam is buggy with one column */
3199:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

3201:   if (str == SUBSET_NONZERO_PATTERN) {
3202:     cusparseStatus_t stat;
3203:     PetscScalar      b = 1.0;
3204: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3205:     size_t           bufferSize;
3206:     void             *buffer;
3207: #endif

3209:     MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3210:     MatSeqAIJCUSPARSEGetArray(Y,&ay);
3211:     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3212: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3213:     stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3214:                                           &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3215:                                           &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3216:                                              cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3217:     cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3218:     PetscLogGpuTimeBegin();
3219:     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3220:                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3221:                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3222:                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3223:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3224:     PetscLogGpuFlops(x->nz + y->nz);
3225:     PetscLogGpuTimeEnd();
3226:     cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3227: #else
3228:     PetscLogGpuTimeBegin();
3229:     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3230:                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3231:                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3232:                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3233:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3234:     PetscLogGpuFlops(x->nz + y->nz);
3235:     PetscLogGpuTimeEnd();
3236: #endif
3237:     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3238:     MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3239:     MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3240:     MatSeqAIJInvalidateDiagonal(Y);
3241:   } else if (str == SAME_NONZERO_PATTERN) {
3242:     cublasHandle_t cublasv2handle;
3243:     cublasStatus_t berr;
3244:     PetscBLASInt   one = 1, bnz = 1;

3246:     MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3247:     MatSeqAIJCUSPARSEGetArray(Y,&ay);
3248:     PetscCUBLASGetHandle(&cublasv2handle);
3249:     PetscBLASIntCast(x->nz,&bnz);
3250:     PetscLogGpuTimeBegin();
3251:     berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3252:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3253:     PetscLogGpuFlops(2.0*bnz);
3254:     PetscLogGpuTimeEnd();
3255:     MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3256:     MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3257:     MatSeqAIJInvalidateDiagonal(Y);
3258:   } else {
3259:     MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3260:     MatAXPY_SeqAIJ(Y,a,X,str);
3261:   }
3262:   return(0);
3263: }
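
/* Usage sketch (illustrative, not part of the library source): Y <- a*X + Y stays on the
   device. With identical nonzero patterns the call reduces to one cublasXaxpy over the
   value arrays; with a subset pattern it goes through the csrgeam path above:

     MatAXPY(Y,2.0,X,SAME_NONZERO_PATTERN);    // fast path: cuBLAS axpy on the nz values
     MatAXPY(Y,2.0,X,SUBSET_NONZERO_PATTERN);  // cusparse csrgeam path with b = 1.0

   Note the routine also probes the two index sets with thrust::equal, so a call declared
   with a weaker pattern can still be upgraded to the cuBLAS path at runtime. */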

3265: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3266: {
3268:   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
3269:   PetscScalar    *ay;
3270:   cudaError_t    cerr;
3271:   cublasHandle_t cublasv2handle;
3272:   cublasStatus_t berr;
3273:   PetscBLASInt   one = 1, bnz = 1;

3276:   MatSeqAIJCUSPARSEGetArray(Y,&ay);
3277:   PetscCUBLASGetHandle(&cublasv2handle);
3278:   PetscBLASIntCast(y->nz,&bnz);
3279:   PetscLogGpuTimeBegin();
3280:   berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
3281:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3282:   PetscLogGpuFlops(bnz);
3283:   PetscLogGpuTimeEnd();
3284:   MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3285:   MatSeqAIJInvalidateDiagonal(Y);
3286:   return(0);
3287: }
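
/* Usage note: MatScale(Y,alpha) on a MATSEQAIJCUSPARSE matrix is a single cublasXscal over
   the nz values, in place on the device; the sparsity pattern is untouched, and the
   GetArray call above already marks the device copy authoritative and drops any cached
   transpose values. */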

3289: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3290: {
3291:   PetscErrorCode             ierr;
3292:   PetscBool                  both = PETSC_FALSE;
3293:   Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;

3296:   if (A->factortype == MAT_FACTOR_NONE) {
3297:     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
3298:     if (spptr->mat) {
3299:       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
3300:       if (matrix->values) {
3301:         both = PETSC_TRUE;
3302:         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3303:       }
3304:     }
3305:     if (spptr->matTranspose) {
3306:       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
3307:       if (matrix->values) {
3308:         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3309:       }
3310:     }
3311:   }
3312:   //MatZeroEntries_SeqAIJ(A);
3313:   PetscArrayzero(a->a,a->i[A->rmap->n]);
3314:   MatSeqAIJInvalidateDiagonal(A);
3315:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3316:   else A->offloadmask = PETSC_OFFLOAD_CPU;

3318:   return(0);
3319: }

3321: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3322: {
3323:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

3327:   if (A->factortype != MAT_FACTOR_NONE) return(0);
3328:   if (flg) {
3329:     MatSeqAIJCUSPARSECopyFromGPU(A);

3331:     A->ops->scale                     = MatScale_SeqAIJ;
3332:     A->ops->axpy                      = MatAXPY_SeqAIJ;
3333:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3334:     A->ops->mult                      = MatMult_SeqAIJ;
3335:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3336:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3337:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3338:     A->ops->multhermitiantranspose    = NULL;
3339:     A->ops->multhermitiantransposeadd = NULL;
3340:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3341:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3342:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3343:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3344:     PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3345:     PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3346:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);
3347:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3348:   } else {
3349:     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3350:     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3351:     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3352:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3353:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3354:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3355:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3356:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3357:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3358:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3359:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);
3360:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3361:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3362:     PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);
3363:     PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);
3364:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);
3365:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3366:   }
3367:   A->boundtocpu = flg;
3368:   a->inode.use = flg;
3369:   return(0);
3370: }
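
/* Effect in user code (hypothetical driver, for illustration):

     MatBindToCPU(A,PETSC_TRUE);   // ops table reverts to the plain SeqAIJ kernels,
                                   // GPU-only composed methods are removed
     MatBindToCPU(A,PETSC_FALSE);  // the CUSPARSE implementations are restored

   Binding to the CPU first copies any device data back to the host so the CPU kernels
   see current values. */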

3372: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3373: {
3374:   PetscErrorCode   ierr;
3375:   cusparseStatus_t stat;
3376:   Mat              B;

3379:   PetscCUDAInitializeCheck(); /* first use of CUSPARSE may be via MatConvert */
3380:   if (reuse == MAT_INITIAL_MATRIX) {
3381:     MatDuplicate(A,MAT_COPY_VALUES,newmat);
3382:   } else if (reuse == MAT_REUSE_MATRIX) {
3383:     MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
3384:   }
3385:   B = *newmat;

3387:   PetscFree(B->defaultvectype);
3388:   PetscStrallocpy(VECCUDA,&B->defaultvectype);

3390:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3391:     if (B->factortype == MAT_FACTOR_NONE) {
3392:       Mat_SeqAIJCUSPARSE *spptr;
3393:       PetscNew(&spptr);
3394:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3395:       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3396:       spptr->format     = MAT_CUSPARSE_CSR;
3397:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3398:       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
3399:       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3400:       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3401:      #endif
3402:       B->spptr = spptr;
3403:     } else {
3404:       Mat_SeqAIJCUSPARSETriFactors *spptr;

3406:       PetscNew(&spptr);
3407:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3408:       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3409:       B->spptr = spptr;
3410:     }
3411:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3412:   }
3413:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
3414:   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
3415:   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
3416:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3417:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3418:   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

3420:   MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
3421:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
3422:   PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
3423:   return(0);
3424: }
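
/* Usage sketch: convert a host AIJ matrix to the GPU type, either into a new matrix or
   in place:

     Mat B;
     MatConvert(A,MATSEQAIJCUSPARSE,MAT_INITIAL_MATRIX,&B);  // duplicate, then retype B
     MatConvert(A,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);  // retype A itself

   The device copy is allocated lazily (offloadmask starts UNALLOCATED), on the first
   operation that needs it. */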

3426: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3427: {

3431:   MatCreate_SeqAIJ(B);
3432:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
3433:   return(0);
3434: }

3436: /*MC
3437:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

3439:    A matrix type whose data resides on Nvidia GPUs. These matrices can be stored in
3440:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3441:    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

3443:    Options Database Keys:
3444: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3445: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3446: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

3448:   Level: beginner

3450: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3451: M*/
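
/* Example (sketch): pick the type and storage format from the options database:

     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,n,n,n,n);
     MatSetFromOptions(A);  // run with -mat_type aijcusparse -mat_cusparse_storage_format csr
*/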

3453: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

3455: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3456: {

3460:   MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);
3461:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
3462:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
3463:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
3464:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);

3466:   return(0);
3467: }
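
/* Usage sketch: once registered, the cusparse factorizations are selectable from the
   options database, e.g.

     -pc_type ilu -pc_factor_mat_solver_type cusparse

   or programmatically via PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE). The band LU
   variant registered above (MATSOLVERCUSPARSEBAND) applies to plain MATSEQAIJ matrices. */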

3469: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3470: {
3471:   PetscErrorCode   ierr;
3472:   cusparseStatus_t stat;

3475:   if (*cusparsestruct) {
3476:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
3477:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
3478:     delete (*cusparsestruct)->workVector;
3479:     delete (*cusparsestruct)->rowoffsets_gpu;
3480:     delete (*cusparsestruct)->cooPerm;
3481:     delete (*cusparsestruct)->cooPerm_a;
3482:     delete (*cusparsestruct)->csr2csc_i;
3483:     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3484:     PetscFree(*cusparsestruct);
3485:   }
3486:   return(0);
3487: }

3489: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3490: {
3492:   if (*mat) {
3493:     delete (*mat)->values;
3494:     delete (*mat)->column_indices;
3495:     delete (*mat)->row_offsets;
3496:     delete *mat;
3497:     *mat = NULL;
3498:   }
3499:   return(0);
3500: }

3502: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3503: {
3504:   cusparseStatus_t stat;
3505:   PetscErrorCode   ierr;

3508:   if (*trifactor) {
3509:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3510:     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
3511:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
3512:     if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
3513:     if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3514:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3515:     if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3516:    #endif
3517:     PetscFree(*trifactor);
3518:   }
3519:   return(0);
3520: }

3522: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
3523: {
3524:   CsrMatrix        *mat;
3525:   cusparseStatus_t stat;
3526:   cudaError_t      err;

3529:   if (*matstruct) {
3530:     if ((*matstruct)->mat) {
3531:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3532:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3533:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3534:        #else
3535:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3536:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3537:        #endif
3538:       } else {
3539:         mat = (CsrMatrix*)(*matstruct)->mat;
3540:         CsrMatrix_Destroy(&mat);
3541:       }
3542:     }
3543:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
3544:     delete (*matstruct)->cprowIndices;
3545:     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
3546:     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
3547:     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

3549:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3550:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3551:     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3552:     for (int i=0; i<3; i++) {
3553:       if (mdata->cuSpMV[i].initialized) {
3554:         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3555:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3556:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3557:       }
3558:     }
3559:    #endif
3560:     delete *matstruct;
3561:     *matstruct = NULL;
3562:   }
3563:   return(0);
3564: }

3566: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3567: {

3571:   if (*trifactors) {
3572:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
3573:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
3574:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
3575:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
3576:     delete (*trifactors)->rpermIndices;
3577:     delete (*trifactors)->cpermIndices;
3578:     delete (*trifactors)->workVector;
3579:     (*trifactors)->rpermIndices = NULL;
3580:     (*trifactors)->cpermIndices = NULL;
3581:     (*trifactors)->workVector = NULL;
3582:     if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
3583:     if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
3584:   }
3585:   return(0);
3586: }

3588: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3589: {
3590:   PetscErrorCode   ierr;
3591:   cusparseHandle_t handle;
3592:   cusparseStatus_t stat;

3595:   if (*trifactors) {
3596:     MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
3597:     if ((handle = (*trifactors)->handle)) { /* intentional assignment */
3598:       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
3599:     }
3600:     PetscFree(*trifactors);
3601:   }
3602:   return(0);
3603: }

3605: struct IJCompare
3606: {
3607:   __host__ __device__
3608:   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3609:   {
3610:     if (t1.get<0>() < t2.get<0>()) return true;
3611:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3612:     return false;
3613:   }
3614: };

3616: struct IJEqual
3617: {
3618:   __host__ __device__
3619:   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3620:   {
3621:     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3622:     return true;
3623:   }
3624: };

3626: struct IJDiff
3627: {
3628:   __host__ __device__
3629:   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3630:   {
3631:     return t1 == t2 ? 0 : 1;
3632:   }
3633: };

3635: struct IJSum
3636: {
3637:   __host__ __device__
3638:   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3639:   {
3640:     return t1||t2;
3641:   }
3642: };
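
/* How the functors above cooperate in MatSetPreallocationCOO below (worked example, for
   illustration): after sort_by_key the (i,j) pairs are in row-major order; IJDiff flags
   positions where i (resp. j) changes, IJSum ORs the two flags, and an inclusive_scan
   turns the flags into segment ids, so duplicate pairs share an id:

     sorted (i,j)   : (0,1) (0,1) (1,0)
     IJDiff on i    :   0     0     1     (first entry reset to 0 by hand)
     IJDiff on j    :   0     0     1
     OR + incl scan :   0     0     1    -> reduce_by_key sums the two (0,1) duplicates */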

3644: #include <thrust/iterator/discard_iterator.h>
3645: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3646: {
3647:   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3648:   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
3649:   THRUSTARRAY                           *cooPerm_v = NULL;
3650:   thrust::device_ptr<const PetscScalar> d_v;
3651:   CsrMatrix                             *matrix;
3652:   PetscErrorCode                        ierr;
3653:   cudaError_t                           cerr;
3654:   PetscInt                              n;

3657:   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
3658:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
3659:   if (!cusp->cooPerm) {
3660:     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
3661:     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
3662:     return(0);
3663:   }
3664:   matrix = (CsrMatrix*)cusp->mat->mat;
3665:   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3666:   if (!v) {
3667:     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3668:     goto finalize;
3669:   }
3670:   n = cusp->cooPerm->size();
3671:   if (isCudaMem(v)) {
3672:     d_v = thrust::device_pointer_cast(v);
3673:   } else {
3674:     cooPerm_v = new THRUSTARRAY(n);
3675:     cooPerm_v->assign(v,v+n);
3676:     d_v = cooPerm_v->data();
3677:     PetscLogCpuToGpu(n*sizeof(PetscScalar));
3678:   }
3679:   PetscLogGpuTimeBegin();
3680:   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3681:     if (cusp->cooPerm_a) {
3682:       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3683:       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3684:       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3685:       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3686:       delete cooPerm_w;
3687:     } else {
3688:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3689:                                                                 matrix->values->begin()));
3690:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3691:                                                                 matrix->values->end()));
3692:       thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
3693:     }
3694:   } else {
3695:     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3696:       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3697:       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3698:     } else {
3699:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3700:                                                                 matrix->values->begin()));
3701:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3702:                                                                 matrix->values->end()));
3703:       thrust::for_each(zibit,zieit,VecCUDAEquals());
3704:     }
3705:   }
3706:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3707:   PetscLogGpuTimeEnd();
3708: finalize:
3709:   delete cooPerm_v;
3710:   A->offloadmask = PETSC_OFFLOAD_GPU;
3711:   PetscObjectStateIncrease((PetscObject)A);
3712:   /* shorter version of MatAssemblyEnd_SeqAIJ */
3713:   PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);
3714:   PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");
3715:   PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);
3716:   a->reallocs         = 0;
3717:   A->info.mallocs    += 0;
3718:   A->info.nz_unneeded = 0;
3719:   A->assembled = A->was_assembled = PETSC_TRUE;
3720:   A->num_ass++;
3721:   return(0);
3722: }
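
/* Usage sketch of the COO interface implemented here (v may be a host or a device pointer;
   isCudaMem above distinguishes the two):

     MatSetPreallocationCOO(A,ncoo,coo_i,coo_j);  // set the pattern once
     MatSetValuesCOO(A,v,ADD_VALUES);             // repeatable; duplicate (i,j) are summed
     MatSetValuesCOO(A,v,INSERT_VALUES);          // overwrite the current values
*/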

3724: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3725: {
3726:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3727:   PetscErrorCode     ierr;

3731:   if (!cusp) return(0);
3732:   if (destroy) {
3733:     MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
3734:     delete cusp->csr2csc_i;
3735:     cusp->csr2csc_i = NULL;
3736:   }
3737:   A->transupdated = PETSC_FALSE;
3738:   return(0);
3739: }

3741: #include <thrust/binary_search.h>
3742: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
3743: {
3744:   PetscErrorCode     ierr;
3745:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3746:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3747:   PetscInt           cooPerm_n, nzr = 0;
3748:   cudaError_t        cerr;

3751:   PetscLayoutSetUp(A->rmap);
3752:   PetscLayoutSetUp(A->cmap);
3753:   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3754:   if (n != cooPerm_n) {
3755:     delete cusp->cooPerm;
3756:     delete cusp->cooPerm_a;
3757:     cusp->cooPerm = NULL;
3758:     cusp->cooPerm_a = NULL;
3759:   }
3760:   if (n) {
3761:     THRUSTINTARRAY d_i(n);
3762:     THRUSTINTARRAY d_j(n);
3763:     THRUSTINTARRAY ii(A->rmap->n);

3765:     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
3766:     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

3768:     PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
3769:     d_i.assign(coo_i,coo_i+n);
3770:     d_j.assign(coo_j,coo_j+n);
3771:     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3772:     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

3774:     PetscLogGpuTimeBegin();
3775:     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3776:     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
3777:     *cusp->cooPerm_a = d_i;
3778:     THRUSTINTARRAY w = d_j;

3780:     auto nekey = thrust::unique(fkey, ekey, IJEqual());
3781:     if (nekey == ekey) { /* all entries are unique */
3782:       delete cusp->cooPerm_a;
3783:       cusp->cooPerm_a = NULL;
3784:     } else { /* I couldn't come up with a more elegant algorithm */
3785:       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
3786:       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
3787:       (*cusp->cooPerm_a)[0] = 0;
3788:       w[0] = 0;
3789:       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
3790:       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
3791:     }
3792:     thrust::counting_iterator<PetscInt> search_begin(0);
3793:     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
3794:                         search_begin, search_begin + A->rmap->n,
3795:                         ii.begin());
3796:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3797:     PetscLogGpuTimeEnd();

3799:     MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);
3800:     a->singlemalloc = PETSC_FALSE;
3801:     a->free_a       = PETSC_TRUE;
3802:     a->free_ij      = PETSC_TRUE;
3803:     PetscMalloc1(A->rmap->n+1,&a->i);
3804:     a->i[0] = 0;
3805:     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3806:     a->nz = a->maxnz = a->i[A->rmap->n];
3807:     a->rmax = 0;
3808:     PetscMalloc1(a->nz,&a->a);
3809:     PetscMalloc1(a->nz,&a->j);
3810:     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3811:     if (!a->ilen) { PetscMalloc1(A->rmap->n,&a->ilen); }
3812:     if (!a->imax) { PetscMalloc1(A->rmap->n,&a->imax); }
3813:     for (PetscInt i = 0; i < A->rmap->n; i++) {
3814:       const PetscInt nnzr = a->i[i+1] - a->i[i];
3815:       nzr += (PetscInt)!!(nnzr);
3816:       a->ilen[i] = a->imax[i] = nnzr;
3817:       a->rmax = PetscMax(a->rmax,nnzr);
3818:     }
3819:     a->nonzerorowcnt = nzr;
3820:     A->preallocated = PETSC_TRUE;
3821:     PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));
3822:     MatMarkDiagonal_SeqAIJ(A);
3823:   } else {
3824:     MatSeqAIJSetPreallocation(A,0,NULL);
3825:   }
3826:   MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);

3828:   /* We want to allocate the CUSPARSE struct for matvec now.
3829:      The code is so convoluted now that I prefer to copy zeros */
3830:   PetscArrayzero(a->a,a->nz);
3831:   MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);
3832:   A->offloadmask = PETSC_OFFLOAD_CPU;
3833:   A->nonzerostate++;
3834:   MatSeqAIJCUSPARSECopyToGPU(A);
3835:   MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);

3837:   A->assembled = PETSC_FALSE;
3838:   A->was_assembled = PETSC_FALSE;
3839:   return(0);
3840: }
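
/* Summary of what this routine leaves behind for MatSetValuesCOO above: cusp->cooPerm maps
   the sorted/unique CSR slots back to positions in the caller's coo_i/coo_j arrays, while
   cusp->cooPerm_a (allocated only when duplicates exist) holds the segment id of each
   sorted entry so repeated (i,j) pairs can be combined with reduce_by_key. */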

3842: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
3843: {
3844:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3845:   CsrMatrix          *csr;
3846:   PetscErrorCode     ierr;

3852:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3853:   MatSeqAIJCUSPARSECopyToGPU(A);
3854:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3855:   csr = (CsrMatrix*)cusp->mat->mat;
3856:   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3857:   *a = csr->values->data().get();
3858:   return(0);
3859: }

3861: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
3862: {
3867:   *a = NULL;
3868:   return(0);
3869: }

3871: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
3872: {
3873:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3874:   CsrMatrix          *csr;
3875:   PetscErrorCode     ierr;

3881:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3882:   MatSeqAIJCUSPARSECopyToGPU(A);
3883:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3884:   csr = (CsrMatrix*)cusp->mat->mat;
3885:   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3886:   *a = csr->values->data().get();
3887:   A->offloadmask = PETSC_OFFLOAD_GPU;
3888:   MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
3889:   return(0);
3890: }

3892: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
3893: {

3900:   PetscObjectStateIncrease((PetscObject)A);
3901:   *a = NULL;
3902:   return(0);
3903: }

3905: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
3906: {
3907:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3908:   CsrMatrix          *csr;
3909:   PetscErrorCode     ierr;

3915:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3916:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3917:   csr = (CsrMatrix*)cusp->mat->mat;
3918:   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3919:   *a = csr->values->data().get();
3920:   A->offloadmask = PETSC_OFFLOAD_GPU;
3921:   MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
3922:   return(0);
3923: }

3925: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
3926: {

3933:   PetscObjectStateIncrease((PetscObject)A);
3934:   *a = NULL;
3935:   return(0);
3936: }
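
/* Access semantics of the three Get/Restore pairs above: the Read variant copies data to
   the device if needed and leaves the offload mask unchanged, so the host copy stays
   valid; the plain variant additionally marks the device copy authoritative and
   invalidates cached transpose values; the Write variant skips the host-to-device copy
   entirely because the caller promises to overwrite every entry. The two writable
   Restore calls bump the object state so dependent objects know the values changed. */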

3938: struct IJCompare4
3939: {
3940:   __host__ __device__
3941:   inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
3942:   {
3943:     if (t1.get<0>() < t2.get<0>()) return true;
3944:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3945:     return false;
3946:   }
3947: };

3949: struct Shift
3950: {
3951:   int _shift;

3953:   Shift(int shift) : _shift(shift) {}
3954:   __host__ __device__
3955:   inline int operator() (const int &c)
3956:   {
3957:     return c + _shift;
3958:   }
3959: };

3961: /* merges two SeqAIJCUSPARSE matrices side by side, the [A';B']' operation in MATLAB notation */
3962: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3963: {
3964:   PetscErrorCode               ierr;
3965:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3966:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3967:   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3968:   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3969:   PetscInt                     Annz,Bnnz;
3970:   cusparseStatus_t             stat;
3971:   PetscInt                     i,m,n,zero = 0;
3972:   cudaError_t                  cerr;

3980:   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
3981:   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3982:   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3983:   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3984:   if (reuse == MAT_INITIAL_MATRIX) {
3985:     m     = A->rmap->n;
3986:     n     = A->cmap->n + B->cmap->n;
3987:     MatCreate(PETSC_COMM_SELF,C);
3988:     MatSetSizes(*C,m,n,m,n);
3989:     MatSetType(*C,MATSEQAIJCUSPARSE);
3990:     c     = (Mat_SeqAIJ*)(*C)->data;
3991:     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3992:     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3993:     Ccsr  = new CsrMatrix;
3994:     Cmat->cprowIndices      = NULL;
3995:     c->compressedrow.use    = PETSC_FALSE;
3996:     c->compressedrow.nrows  = 0;
3997:     c->compressedrow.i      = NULL;
3998:     c->compressedrow.rindex = NULL;
3999:     Ccusp->workVector       = NULL;
4000:     Ccusp->nrows    = m;
4001:     Ccusp->mat      = Cmat;
4002:     Ccusp->mat->mat = Ccsr;
4003:     Ccsr->num_rows  = m;
4004:     Ccsr->num_cols  = n;
4005:     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4006:     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4007:     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4008:     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4009:     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4010:     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4011:     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4012:     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4013:     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4014:     MatSeqAIJCUSPARSECopyToGPU(A);
4015:     MatSeqAIJCUSPARSECopyToGPU(B);
4016:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
4017:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);
4018:     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4019:     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

4021:     Acsr = (CsrMatrix*)Acusp->mat->mat;
4022:     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4023:     Annz = (PetscInt)Acsr->column_indices->size();
4024:     Bnnz = (PetscInt)Bcsr->column_indices->size();
4025:     c->nz = Annz + Bnnz;
4026:     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4027:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4028:     Ccsr->values = new THRUSTARRAY(c->nz);
4029:     Ccsr->num_entries = c->nz;
4030:     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4031:     if (c->nz) {
4032:       auto Acoo = new THRUSTINTARRAY32(Annz);
4033:       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4034:       auto Ccoo = new THRUSTINTARRAY32(c->nz);
4035:       THRUSTINTARRAY32 *Aroff,*Broff;

4037:       if (a->compressedrow.use) { /* need full row offset */
4038:         if (!Acusp->rowoffsets_gpu) {
4039:           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4040:           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4041:           PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
4042:         }
4043:         Aroff = Acusp->rowoffsets_gpu;
4044:       } else Aroff = Acsr->row_offsets;
4045:       if (b->compressedrow.use) { /* need full row offset */
4046:         if (!Bcusp->rowoffsets_gpu) {
4047:           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4048:           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4049:           PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
4050:         }
4051:         Broff = Bcusp->rowoffsets_gpu;
4052:       } else Broff = Bcsr->row_offsets;
4053:       PetscLogGpuTimeBegin();
4054:       stat = cusparseXcsr2coo(Acusp->handle,
4055:                               Aroff->data().get(),
4056:                               Annz,
4057:                               m,
4058:                               Acoo->data().get(),
4059:                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4060:       stat = cusparseXcsr2coo(Bcusp->handle,
4061:                               Broff->data().get(),
4062:                               Bnnz,
4063:                               m,
4064:                               Bcoo->data().get(),
4065:                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4066:       /* issues when using bool with large matrices on SUMMIT with CUDA 10.2.89 */
4067:       auto Aperm = thrust::make_constant_iterator(1);
4068:       auto Bperm = thrust::make_constant_iterator(0);
4069: #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4070:       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4071:       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4072: #else
4073:       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4074:       auto Bcib = Bcsr->column_indices->begin();
4075:       auto Bcie = Bcsr->column_indices->end();
4076:       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4077: #endif
4078:       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4079:       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4080:       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4081:       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4082:       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4083:       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4084:       auto p1 = Ccusp->cooPerm->begin();
4085:       auto p2 = Ccusp->cooPerm->begin();
4086:       thrust::advance(p2,Annz);
4087:       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4088: #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4089:       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4090: #endif
4091:       auto cci = thrust::make_counting_iterator(zero);
4092:       auto cce = thrust::make_counting_iterator(c->nz);
4093: #if 0 // thrust::partition_copy errors on SUMMIT with CUDA 11.1.0
4094:       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4095: #else
4096:       auto pred = thrust::identity<int>();
4097:       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4098:       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4099: #endif
4100:       stat = cusparseXcoo2csr(Ccusp->handle,
4101:                               Ccoo->data().get(),
4102:                               c->nz,
4103:                               m,
4104:                               Ccsr->row_offsets->data().get(),
4105:                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4106:       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4107:       PetscLogGpuTimeEnd();
4108:       delete wPerm;
4109:       delete Acoo;
4110:       delete Bcoo;
4111:       delete Ccoo;
4112: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4113:       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4114:                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4115:                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4116:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4117: #endif
4118:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4119:         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4120:         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4121:         CsrMatrix *CcsrT = new CsrMatrix;
4122:         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4123:         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

4125:         (*C)->form_explicit_transpose = PETSC_TRUE;
4126:         (*C)->transupdated = PETSC_TRUE;
4127:         Ccusp->rowoffsets_gpu = NULL;
4128:         CmatT->cprowIndices = NULL;
4129:         CmatT->mat = CcsrT;
4130:         CcsrT->num_rows = n;
4131:         CcsrT->num_cols = m;
4132:         CcsrT->num_entries = c->nz;

4134:         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4135:         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4136:         CcsrT->values = new THRUSTARRAY(c->nz);

4138:         PetscLogGpuTimeBegin();
4139:         auto rT = CcsrT->row_offsets->begin();
4140:         if (AT) {
4141:           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4142:           thrust::advance(rT,-1);
4143:         }
4144:         if (BT) {
4145:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4146:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4147:           thrust::copy(titb,tite,rT);
4148:         }
4149:         auto cT = CcsrT->column_indices->begin();
4150:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4151:         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4152:         auto vT = CcsrT->values->begin();
4153:         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4154:         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4155:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4156:         PetscLogGpuTimeEnd();

4158:         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4159:         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4160:         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4161:         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4162:         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4163:         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4164:         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4165:         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4166:         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4167: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4168:         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4169:                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4170:                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4171:                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4172: #endif
4173:         Ccusp->matTranspose = CmatT;
4174:       }
4175:     }

4177:     c->singlemalloc = PETSC_FALSE;
4178:     c->free_a       = PETSC_TRUE;
4179:     c->free_ij      = PETSC_TRUE;
4180:     PetscMalloc1(m+1,&c->i);
4181:     PetscMalloc1(c->nz,&c->j);
4182:     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4183:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4184:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4185:       ii   = *Ccsr->row_offsets;
4186:       jj   = *Ccsr->column_indices;
4187:       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4188:       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4189:     } else {
4190:       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4191:       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4192:     }
4193:     PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
4194:     PetscMalloc1(m,&c->ilen);
4195:     PetscMalloc1(m,&c->imax);
4196:     c->maxnz = c->nz;
4197:     c->nonzerorowcnt = 0;
4198:     c->rmax = 0;
4199:     for (i = 0; i < m; i++) {
4200:       const PetscInt nn = c->i[i+1] - c->i[i];
4201:       c->ilen[i] = c->imax[i] = nn;
4202:       c->nonzerorowcnt += (PetscInt)!!nn;
4203:       c->rmax = PetscMax(c->rmax,nn);
4204:     }
4205:     MatMarkDiagonal_SeqAIJ(*C);
4206:     PetscMalloc1(c->nz,&c->a);
4207:     (*C)->nonzerostate++;
4208:     PetscLayoutSetUp((*C)->rmap);
4209:     PetscLayoutSetUp((*C)->cmap);
4210:     Ccusp->nonzerostate = (*C)->nonzerostate;
4211:     (*C)->preallocated  = PETSC_TRUE;
4212:   } else {
4213:     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
4214:     c = (Mat_SeqAIJ*)(*C)->data;
4215:     if (c->nz) {
4216:       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4217:       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4218:       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4219:       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4220:       MatSeqAIJCUSPARSECopyToGPU(A);
4221:       MatSeqAIJCUSPARSECopyToGPU(B);
4222:       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4223:       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4224:       Acsr = (CsrMatrix*)Acusp->mat->mat;
4225:       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4226:       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4227:       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4228:       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4229:       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4230:       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4231:       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4232:       auto pmid = Ccusp->cooPerm->begin();
4233:       thrust::advance(pmid,Acsr->num_entries);
4234:       PetscLogGpuTimeBegin();
4235:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4236:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4237:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4238:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4239:       thrust::for_each(zibait,zieait,VecCUDAEquals());
4240:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4241:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4242:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4243:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4244:       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4245:       MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);
4246:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4247:         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4248:         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4249:         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4250:         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4251:         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4252:         auto vT = CcsrT->values->begin();
4253:         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4254:         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4255:         (*C)->transupdated = PETSC_TRUE;
4256:       }
4257:       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4258:       PetscLogGpuTimeEnd();
4259:     }
4260:   }
4261:   PetscObjectStateIncrease((PetscObject)*C);
4262:   (*C)->assembled     = PETSC_TRUE;
4263:   (*C)->was_assembled = PETSC_FALSE;
4264:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4265:   return(0);
4266: }
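
/* Usage sketch (internal API): concatenate two matrices with the same row layout entirely
   on the device:

     Mat C;
     MatSeqAIJCUSPARSEMergeMats(A,B,MAT_INITIAL_MATRIX,&C); // builds [A B] and its pattern
     MatSeqAIJCUSPARSEMergeMats(A,B,MAT_REUSE_MATRIX,&C);   // values-only update via cooPerm
*/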

4268: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4269: {
4270:   PetscErrorCode    ierr;
4271:   bool              dmem;
4272:   const PetscScalar *av;
4273:   cudaError_t       cerr;

4276:   dmem = isCudaMem(v);
4277:   MatSeqAIJCUSPARSEGetArrayRead(A,&av);
4278:   if (n && idx) {
4279:     THRUSTINTARRAY widx(n);
4280:     widx.assign(idx,idx+n);
4281:     PetscLogCpuToGpu(n*sizeof(PetscInt));

4283:     THRUSTARRAY *w = NULL;
4284:     thrust::device_ptr<PetscScalar> dv;
4285:     if (dmem) {
4286:       dv = thrust::device_pointer_cast(v);
4287:     } else {
4288:       w = new THRUSTARRAY(n);
4289:       dv = w->data();
4290:     }
4291:     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

4293:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4294:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4295:     thrust::for_each(zibit,zieit,VecCUDAEquals());
4296:     if (w) {
4297:       cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4298:     }
4299:     delete w;
4300:   } else {
4301:     cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4302:   }
4303:   if (!dmem) { PetscLogCpuToGpu(n*sizeof(PetscScalar)); }
4304:   MatSeqAIJCUSPARSERestoreArrayRead(A,&av);
4305:   return(0);
4306: }

4308: /*
4309:   LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)

4311:   requires:
4312:      a structurally symmetric matrix (this restriction could be lifted with transpose/column metadata)
4313: */

4315: /*
4316:   The GPU LU factor kernel
4317: */
4318: __global__
4319: void __launch_bounds__(1024,1)
4320: mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
4321: {
4322:   const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4323:   const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
4324:   const PetscInt  nloc_i =  (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

4326:   // set i (row+1)
4327:   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
4328:   // for (int rowb = start_i + blkIdx*blockDim.y + threadIdx.y; rowb < end_i; rowb += Nblk*blockDim.y) { // rows in block
4329:   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4330:     if (rowb < end_i && threadIdx.x==0) {
4331:       PetscInt i=rowb+1, ni = (rowb>bw) ? bw+1 : i, n1L = ni*(ni-1)/2, nug= i*bw, n2L = bw*((rowb>bw) ? (rowb-bw) : 0), mi = bw + rowb + 1 - n, clip = (mi>0) ? mi*(mi-1)/2 + mi: 0;
4332:       bi_csr[rowb+1] = n1L + nug - clip + n2L + i;
4333:     }
4334:   }
4335: }
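
/* Sanity check on the prefix formula above (illustrative derivation): band row r stores
   min(r,bw) + 1 + min(n-1-r,bw) entries, and bi_csr[r+1] is a closed form for the sum of
   those lengths over rows 0..r: n1L sums the growing lower "ear" (0,1,...), n2L adds a
   full bw for every row past the ear, nug counts every upper part as bw, clip removes the
   upper entries cut off near row n-1, and i adds the diagonals. */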
4336: // copy AIJ to AIJ_BAND
4337: __global__
4338: void __launch_bounds__(1024,1)
4339: mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
4340:                                 const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
4341:                                 const int bi_csr[], PetscScalar ba_csr[])
4342: {
4343:   const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4344:   const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
4345:   const PetscInt  nloc_i =  (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

4347:   // zero B
4348:   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
4349:   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4350:     if (rowb < end_i) {
4351:       PetscScalar    *batmp = ba_csr + bi_csr[rowb];
4352:       const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb];
4353:       for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) {
4354:         if (j<nzb) {
4355:           batmp[j] = 0;
4356:         }
4357:       }
4358:     }
4359:   }

4361:   // copy A into B with CSR format -- these two loops can be fused
4362:   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4363:     if (rowb < end_i) {
4364:       const PetscInt    rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
4365:       const int         *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0;
4366:       const PetscScalar *av    = aa_d + ai_d[rowa];
4367:       PetscScalar       *batmp = ba_csr + bi_csr[rowb];
4368:       /* load in initial (unfactored row) */
4369:       for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
4370:         if (j<nza) {
4371:           PetscInt    colb = ic[ajtmp[j]], idx = colb - bjStart;
4372:           PetscScalar vala = av[j];
4373:           batmp[idx] = vala;
4374:         }
4375:       }
4376:     }
4377:   }
4378: }
4379: // print AIJ_BAND
4380: __global__
4381: void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
4382: {
4383:   // debug
4384:   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0){
4385:     printf("B (AIJ) n=%d:\n",(int)n);
4386:     for (int rowb=0;rowb<n;rowb++) {
4387:       const PetscInt    nz = bi_csr[rowb+1] - bi_csr[rowb];
4388:       const PetscScalar *batmp = ba_csr + bi_csr[rowb];
4389:       for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j]));
4390:       printf(" bi=%d\n",bi_csr[rowb+1]);
4391:     }
4392:   }
4393: }
4394: // Band LU kernel ---  ba_csr bi_csr
4395: __global__
4396: void __launch_bounds__(1024,1)
4397: mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[])
4398: {
4399:   extern __shared__ PetscInt smemInt[];
4400:   PetscInt        *sm_pkIdx  = &smemInt[0];
4401:   const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4402:   const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
4403:   const PetscInt  start = field*nloc, end = start + nloc;
4404: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4405:   auto g = cooperative_groups::this_grid();
4406: #endif
4407:   // A22 panel update for each row A(1,:) and col A(:,1)
4408:   for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
4409:     PetscInt          tnzUd = bw, maxU = end-1 - glbDD; // clip U at the end of the field: we are chopping off the interior "ears"
4410:     const PetscInt    nzUd  = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // global to go past ears after first
4411:     const PetscInt    nzUd_pad = blockDim.y*(nzUd/blockDim.y + !!(nzUd%blockDim.y));
4412:     PetscScalar       *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
4413:     const PetscScalar *baUd = pBdd + 1; // vector of data  U(i,i+1:end)
4414:     const PetscScalar Bdd = *pBdd;
4415:     const PetscInt offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
4416:     for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd_pad ; idx += inc, myi += inc) { /* assuming symmetric structure */
4417:       if (idx < nzUd && threadIdx.x==0) { /* assuming symmetric structure */
4418:         const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
4419:         PetscScalar    *Aid = ba_csr + bi_csr[myi] + kIdx;
4420:         *Aid = *Aid/Bdd;
4421:         sm_pkIdx[threadIdx.y] = kIdx;
4422:       }
4423:       __syncthreads(); // sync so sm_pkIdx, written by threadIdx.x==0, is visible to the rest of the row's threads
4424:       if (idx < nzUd) { /* assuming symmetric structure */
4425:         PetscInt    kIdx = sm_pkIdx[threadIdx.y];
4426:         PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
4427:         PetscScalar *Aij =  Aid + 1;
4428:         PetscScalar Lid  = *Aid;
4429:         for (int jIdx=threadIdx.x ; jIdx<nzUd ; jIdx += blockDim.x) {
4430:           if (jIdx<nzUd) {
4431:             Aij[jIdx] -= Lid*baUd[jIdx];
4432:           }
4433:         }
4434:       }
4435:     }
4436: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4437:     g.sync();
4438: #else
4439:     __syncthreads();
4440: #endif
4441:   } /* end of for (glbDD=start; glbDD<end; glbDD++) */
4442: }
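// A serial reference sketch (assuming no pivoting and Nf==1) of the
// right-looking update the kernel above performs in parallel; band_slot() is
// the hypothetical helper sketched earlier. Fill stays inside the band, so
// the symbolic structure never grows.
static void band_lu_host(int n, int bw, const int bi[], PetscScalar ba[])
{
  for (int d = 0; d < n; d++) {
    const PetscScalar Bdd = ba[bi[d] + band_slot(d, d, bw)];
    for (int i = d+1; i <= d+bw && i < n; i++) {
      PetscScalar *Lid = &ba[bi[i] + band_slot(i, d, bw)];
      *Lid /= Bdd;                               /* scale the column of L */
      for (int j = d+1; j <= d+bw && j < n; j++) /* rank-1 trailing update */
        ba[bi[i] + band_slot(i, j, bw)] -= (*Lid) * ba[bi[d] + band_slot(d, j, bw)];
    }
  }
}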

4444: static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);
4445: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
4446: {
4447:   Mat_SeqAIJ                   *b = (Mat_SeqAIJ*)B->data;
4448:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
4449:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
4450:   Mat_SeqAIJCUSPARSE           *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
4451:   Mat_SeqAIJCUSPARSEMultStruct *matstructA;
4452:   CsrMatrix                    *matrixA;
4453:   PetscErrorCode               ierr;
4454:   cudaError_t                  cerr;
4455:   const PetscInt               n=A->rmap->n, *ic, *r;
4456:   const int                    *ai_d, *aj_d;
4457:   const PetscScalar            *aa_d;
4458:   PetscScalar                  *ba_t = cusparseTriFactors->a_band_d;
4459:   int                          *bi_t = cusparseTriFactors->i_band_d;
4460:   PetscContainer               container;
4461:   int                          Ni = 10, team_size=9, Nf, nVec=56, nconcurrent = 1, nsm = -1;

4464:   if (A->rmap->n == 0) {
4465:     return(0);
4466:   }
4467:   // cusparse setup
4468:   if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
4469:   matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; //  matstruct->cprowIndices
4470:   if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
4471:   matrixA = (CsrMatrix*)matstructA->mat;
4472:   if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");

4474:   // factor: get Nf if available
4475:   PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
4476:   if (container) {
4477:     PetscInt *pNf=NULL;
4478:     PetscContainerGetPointer(container, (void **) &pNf);
4479:     Nf = (*pNf)%1000;
4480:     if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use
4481:   } else Nf = 1;
4482:   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

4484:   // get data
4485:   ic      = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
4486:   ai_d    = thrust::raw_pointer_cast(matrixA->row_offsets->data());
4487:   aj_d    = thrust::raw_pointer_cast(matrixA->column_indices->data());
4488:   aa_d    = thrust::raw_pointer_cast(matrixA->values->data());
4489:   r       = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());

4491:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
4492:   PetscLogGpuTimeBegin();
4493:   {
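    /* Note (assumption spelled out here, not in the original): bw below inverts
       the band nonzero count used in the symbolic phase, nz = n + (2n-1)*bw - bw*bw;
       solving bw^2 - (2n-1)*bw + (nz-n) = 0 for the smaller root gives
       bw = ((2n-1) - sqrt(1+4*(n^2-nz)))/2 */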
4494:     int bw = (2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf;
4495:     int gpuid;
4496:     cudaDeviceProp prop;
4497:     cerr = cudaGetDevice(&gpuid);CHKERRCUDA(cerr);
4498:     cerr = cudaGetDeviceProperties(&prop, gpuid);CHKERRCUDA(cerr);
4499: #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
4500:     Ni = 1; // no cooperative grid sync before CUDA 11, so use a single block row
4502: #else
4503:     nsm = prop.multiProcessorCount;
4504:     Ni = nsm/Nf/nconcurrent;
4505: #endif
4506:     team_size = bw/Ni + !!(bw%Ni);
4507:     nVec = PetscMin(bw, 1024/team_size);
4508:     PetscInfo5(A,"Matrix bandwidth = %d, number of SMs/block = %d, num concurrency = %d, num fields = %d, num SMs/GPU = %d\n",bw,Ni,nconcurrent,Nf,nsm);
4509:     {
4510:       dim3 dimBlockTeam(nVec,team_size);
4511:       dim3 dimBlockLeague(Nf,Ni);
4512:       mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
4513:       CHECK_LAUNCH_ERROR(); // does a sync
4514: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4515:       void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t};
4516:       cerr = cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, team_size*sizeof(PetscInt), NULL);CHKERRCUDA(cerr); // cooperative launch is required for the grid-wide g.sync()
4517: #else
4518:       mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam,team_size*sizeof(PetscInt)>>>(n, bw, bi_t, ba_t);
4519: #endif
4520:       CHECK_LAUNCH_ERROR(); // does a sync
4521: #if defined(PETSC_USE_LOG)
4522:       PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(2*bm1 + 1)/3 + 2*(nl-bw)*bw*bw + nl*(nl+1)/2));
4523: #endif
4524:     }
4525:   }
4526:   PetscLogGpuTimeEnd();

4528:   /* determine which MatSolve implementations to use, as in MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE */
4529:   B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
4530:   B->ops->solvetranspose = NULL; // transpose solve not implemented yet
4531:   B->ops->matsolve = NULL;
4532:   B->ops->matsolvetranspose = NULL;

4534:   return(0);
4535: }

4537: static PetscErrorCode MatrixNfDestroy(void *ptr)
4538: {
4539:   PetscInt *nf = (PetscInt *)ptr;
4540:   PetscErrorCode  ierr;
4542:   PetscFree(nf);
4543:   return(0);
4544: }

4546: PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4547: {
4548:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b;
4549:   IS                 isicol;
4550:   PetscErrorCode     ierr;
4551:   cudaError_t        cerr;
4552:   const PetscInt     *ic,*ai=a->i,*aj=a->j;
4553:   PetscScalar        *ba_t;
4554:   int                *bi_t;
4555:   PetscInt           i,n=A->rmap->n,Nf;
4556:   PetscInt           nzBcsr,bwL,bwU;
4557:   PetscBool          missing;
4558:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
4559:   PetscContainer               container;

4562:   if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
4563:   MatMissingDiagonal(A,&missing,&i);
4564:   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
4565:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Missing cusparseTriFactors");
4566:   MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);
4567:   if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structurally symmetric matrices supported");

4569:   // factor: get Nf if available
4570:   PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
4571:   if (container) {
4572:     PetscInt *pNf=NULL;
4573:     PetscContainerGetPointer(container, (void **) &pNf);
4574:     Nf = (*pNf)%1000;
4575:     PetscContainerCreate(PETSC_COMM_SELF, &container);
4576:     PetscMalloc(sizeof(PetscInt), &pNf);
4577:     *pNf = Nf;
4578:     PetscContainerSetPointer(container, (void *)pNf);
4579:     PetscContainerSetUserDestroy(container, MatrixNfDestroy);
4580:     PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);
4581:     PetscContainerDestroy(&container);
4582:   } else Nf = 1;
4583:   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

4585:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
4586:   ISGetIndices(isicol,&ic);

4588:   MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);
4589:   PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);
4590:   b    = (Mat_SeqAIJ*)(B)->data;

4592:   /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
4593:   bwL = bwU = 0;
4594:   for (int rwb=0; rwb<n; rwb++) {
4595:     const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
4596:     for (int j=0;j<anz;j++) {
4597:       PetscInt colb = ic[ajtmp[j]];
4598:       if (colb<rwa) { // L
4599:         if (rwa-colb > bwL) bwL = rwa-colb;
4600:       } else {
4601:         if (colb-rwa > bwU) bwU = colb-rwa;
4602:       }
4603:     }
4604:   }
4605:   ISRestoreIndices(isicol,&ic);
4606:   /* only structurally symmetric matrices are supported (for now); an unsymmetric bandwidth might still work but is untested */
4607:   if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
4608:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
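  /* Note (derivation, not in the original): a full band of half-width bw holds
     n*(2*bw+1) entries; the two triangular "ears" that fall outside the matrix
     remove bw*(bw+1)/2 each, leaving n*(2*bw+1) - bw*(bw+1) = n + (2*n-1)*bw - bw*bw */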
4609:   nzBcsr = n + (2*n-1)*bwU - bwU*bwU;
4610:   b->maxnz = b->nz = nzBcsr;
4611:   cusparseTriFactors->nnz = b->nz; // only meta data needed: n & nz
4612:   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
4613:   cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // include a slot for the flop count
4614:   cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
4615:   cusparseTriFactors->a_band_d = ba_t;
4616:   cusparseTriFactors->i_band_d = bi_t;
4617:   /* In b structure:  Free imax, ilen, old a, old j.  Allocate solve_work, new a, new j */
4618:   PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));
4619:   {
4620:     dim3 dimBlockTeam(1,128);
4621:     dim3 dimBlockLeague(Nf,1);
4622:     mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
4623:   }
4624:   CHECK_LAUNCH_ERROR(); // does a sync

4626:   // setup data
4627:   if (!cusparseTriFactors->rpermIndices) {
4628:     const PetscInt *r;

4630:     ISGetIndices(isrow,&r);
4631:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
4632:     cusparseTriFactors->rpermIndices->assign(r, r+n);
4633:     ISRestoreIndices(isrow,&r);
4634:     PetscLogCpuToGpu(n*sizeof(PetscInt));
4635:   }
4636:   /* upper triangular indices */
4637:   if (!cusparseTriFactors->cpermIndices) {
4638:     const PetscInt *c;

4640:     ISGetIndices(isicol,&c);
4641:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
4642:     cusparseTriFactors->cpermIndices->assign(c, c+n);
4643:     ISRestoreIndices(isicol,&c);
4644:     PetscLogCpuToGpu(n*sizeof(PetscInt));
4645:   }

4647:   /* put together the new matrix */
4648:   b->free_a       = PETSC_FALSE;
4649:   b->free_ij      = PETSC_FALSE;
4650:   b->singlemalloc = PETSC_FALSE;
4651:   b->ilen = NULL;
4652:   b->imax = NULL;
4653:   b->row  = isrow;
4654:   b->col  = iscol;
4655:   PetscObjectReference((PetscObject)isrow);
4656:   PetscObjectReference((PetscObject)iscol);
4657:   b->icol = isicol;
4658:   PetscMalloc1(n+1,&b->solve_work);

4660:   B->factortype            = MAT_FACTOR_LU;
4661:   B->info.factor_mallocs   = 0;
4662:   B->info.fill_ratio_given = 0;

4664:   if (ai[n]) {
4665:     B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
4666:   } else {
4667:     B->info.fill_ratio_needed = 0.0;
4668:   }
4669: #if defined(PETSC_USE_INFO)
4670:   if (ai[n] != 0) {
4671:     PetscReal af = B->info.fill_ratio_needed;
4672:     PetscInfo1(A,"Band fill ratio %g\n",(double)af);
4673:   } else {
4674:     PetscInfo(A,"Empty matrix\n");
4675:   }
4676: #endif
4677:   if (a->inode.size) {
4678:     PetscInfo(A,"Warning: using inodes in band solver.\n");
4679:   }
4680:   MatSeqAIJCheckInode_FactorLU(B);
4681:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
4682:   B->offloadmask = PETSC_OFFLOAD_GPU;

4684:   return(0);
4685: }

4687: /* Use -pc_factor_mat_solver_type cusparseband */
4688: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
4689: {
4691:   *type = MATSOLVERCUSPARSEBAND;
4692:   return(0);
4693: }

4695: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
4696: {
4698:   PetscInt       n = A->rmap->n;

4701:   MatCreate(PetscObjectComm((PetscObject)A),B);
4702:   MatSetSizes(*B,n,n,n,n);
4703:   (*B)->factortype = ftype;
4704:   (*B)->useordering = PETSC_TRUE;
4705:   MatSetType(*B,MATSEQAIJCUSPARSE);

4707:   if (ftype == MAT_FACTOR_LU) {
4708:     MatSetBlockSizesFromMats(*B,A,A);
4709:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
4710:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
4711:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");

4713:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
4714:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);
4715:   return(0);
4716: }
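/* A minimal usage sketch (disabled; the Mat A and Vecs b, x are assumptions,
   not from this file) showing how the factor type registered above is reached
   directly through the Mat API; under KSP/PC the command-line option noted
   above selects it instead. */
#if 0
static PetscErrorCode BandLUSolveExample(Mat A, Vec b, Vec x)
{
  Mat           F;
  IS            rowperm, colperm;
  MatFactorInfo info;

  MatGetOrdering(A, MATORDERINGND, &rowperm, &colperm);
  MatGetFactor(A, MATSOLVERCUSPARSEBAND, MAT_FACTOR_LU, &F);
  MatFactorInfoInitialize(&info);
  MatLUFactorSymbolic(F, A, rowperm, colperm, &info);
  MatLUFactorNumeric(F, A, &info);
  MatSolve(F, b, x); /* dispatches to MatSolve_SeqAIJCUSPARSEBAND */
  ISDestroy(&rowperm);
  ISDestroy(&colperm);
  MatDestroy(&F);
  return(0);
}
#endif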

4718: #define WARP_SIZE 32
4719: template <typename T>
4720: __forceinline__ __device__
4721: T wreduce(T a)
4722: {
4723:   T b;
4724:   #pragma unroll
4725:   for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) {
4726:     b = __shfl_down_sync(0xffffffff, a, i);
4727:     a += b;
4728:   }
4729:   return a;
4730: }
4731: // reduce in a block, returns result in thread 0
4732: template <typename T, int BLOCK_SIZE>
4733: __device__
4734: T breduce(T a)
4735: {
4736:   constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
4737:   __shared__ T buf[NWARP]; // per-warp partials; use T (not double) so any real scalar type reduces correctly
4738:   int wid = threadIdx.x / WARP_SIZE;
4739:   int laneid = threadIdx.x % WARP_SIZE;
4740:   T b = wreduce<T>(a);
4741:   if (laneid == 0)
4742:     buf[wid] = b;
4743:   __syncthreads();
4744:   if (wid == 0) {
4745:     if (threadIdx.x < NWARP)
4746:       a = buf[threadIdx.x];
4747:     else
4748:       a = 0;
4749:     for (int i = (NWARP+1)/2; i >= 1; i = i >> 1) {
4750:       a += __shfl_down_sync(0xffffffff, a, i);
4751:     }
4752:   }
4753:   return a;
4754: }
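// An illustrative kernel (not part of PETSc) exercising breduce(): each
// thread accumulates a strided partial sum, the block reduces it, and thread
// 0 writes the result. Assumes BLOCK_SIZE is a power-of-two multiple of
// WARP_SIZE, as the templates above require.
template <int BLOCK_SIZE>
__global__ void example_block_sum(const PetscReal v[], PetscInt n, PetscReal *out)
{
  PetscReal a = 0;
  for (PetscInt i = threadIdx.x; i < n; i += BLOCK_SIZE) a += v[i];
  a = breduce<PetscReal,BLOCK_SIZE>(a);
  if (threadIdx.x == 0) *out = a; // result is valid only in thread 0
}
// e.g. example_block_sum<128><<<1,128>>>(d_v, n, d_out);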


4757: // Band LU kernel ---  ba_csr bi_csr
4758: template <int BLOCK_SIZE>
4759: __global__
4760: void __launch_bounds__(256,1)
4761: mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
4762: {
4763:   const PetscInt    Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc, chopnz = bw*(bw+1)/2, blocknz=(2*bw+1)*nloc, blocknz_0 = blocknz-chopnz;
4764:   const PetscScalar *pLi;
4765:   const int tid = threadIdx.x;

4767:   /* Next, solve L */
4768:   pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field
4769:   for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
4770:     const PetscInt col = locDD<bw ? start : (glbDD-bw);
4771:     PetscScalar t = 0;
4772:     for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
4773:       t += pLi[idx]*x[j];
4774:     }
4775: #if defined(PETSC_USE_COMPLEX)
4776:     PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
4777:     PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
4778:     t = tt;
4779: #else
4780:     t = breduce<PetscReal,BLOCK_SIZE>(t);
4781: #endif
4782:     if (threadIdx.x == 0)
4783:       x[glbDD] -= t; // L has a unit diagonal, so no division needed
4784:     __syncthreads();
4785:     // inc
4786:     pLi += glbDD-col; // get to diagonal
4787:     if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U, only last block has funny offset
4788:     else pLi += bw;
4789:     pLi += 1; // skip to next row
4790:     if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip padding at beginning (ear)
4791:   }
4792:   /* Then, solve U */
4793:   pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of real data on block (diagonal)
4794:   if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row
4795:   for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
4796:     const PetscInt col = (locDD<bw) ? end-1 : glbDD+bw; // end of row in U
4797:     PetscScalar t = 0;
4798:     for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
4799:       t += pLi[-idx]*x[j];
4800:     }
4801: #if defined(PETSC_USE_COMPLEX)
4802:     PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
4803:     PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
4804:     t = tt;
4805: #else
4806:     t = breduce<PetscReal,BLOCK_SIZE>(t);
4807: #endif
4808:     pLi -= col-glbDD; // diagonal
4809:     if (threadIdx.x == 0) {
4810:       x[glbDD] -= t;
4811:       x[glbDD] /= pLi[0];
4812:     }
4813:     __syncthreads();
4814:     // inc past L to start of previous U
4815:     pLi -= bw+1;
4816:     if (glbDD<bw) pLi += bw-glbDD; // overshot in top left corner
4817:     if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past right corner
4818:   }
4819: }
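// A serial reference sketch (assuming Nf==1) of the two sweeps above: forward
// substitution with unit-diagonal L, then backward substitution dividing by
// U's diagonal; band_slot() is the hypothetical helper sketched earlier.
static void band_solve_host(int n, int bw, const int bi[], const PetscScalar ba[], PetscScalar x[])
{
  for (int i = 0; i < n; i++)      /* solve L y = b (L has a unit diagonal) */
    for (int j = (i > bw) ? i - bw : 0; j < i; j++)
      x[i] -= ba[bi[i] + band_slot(i, j, bw)] * x[j];
  for (int i = n-1; i >= 0; i--) { /* solve U x = y */
    for (int j = i+1; j <= i+bw && j < n; j++)
      x[i] -= ba[bi[i] + band_slot(i, j, bw)] * x[j];
    x[i] /= ba[bi[i] + band_slot(i, i, bw)];
  }
}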

4821: static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
4822: {
4823:   const PetscScalar                     *barray;
4824:   PetscScalar                           *xarray;
4825:   thrust::device_ptr<const PetscScalar> bGPU;
4826:   thrust::device_ptr<PetscScalar>       xGPU;
4827:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4828:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
4829:   PetscInt                              n=A->rmap->n, nz=cusparseTriFactors->nnz, bw=(2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-nz))+PETSC_MACHINE_EPSILON))/2, Nf;
4830:   PetscErrorCode                        ierr;
4831:   cudaError_t                           cerr;
4832:   PetscContainer                        container;

4835:   if (A->rmap->n == 0) {
4836:     return(0);
4837:   }
4838:   // factor: get Nf if available
4839:   PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
4840:   if (container) {
4841:     PetscInt *pNf=NULL;
4842:     PetscContainerGetPointer(container, (void **) &pNf);
4843:     Nf = (*pNf)%1000;
4844:   } else Nf = 1;
4845:   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

4847:   /* Get the GPU pointers */
4848:   VecCUDAGetArrayWrite(xx,&xarray);
4849:   VecCUDAGetArrayRead(bb,&barray);
4850:   xGPU = thrust::device_pointer_cast(xarray);
4851:   bGPU = thrust::device_pointer_cast(barray);

4853:   PetscLogGpuTimeBegin();
4854:   /* First, reorder with the row permutation */
4855:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
4856:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
4857:                tempGPU->begin());
4858:   constexpr int block = 128;
4859:   mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
4860:   CHECK_LAUNCH_ERROR(); // does a sync

4862:   /* Last, reorder with the column permutation */
4863:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
4864:                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
4865:                xGPU);

4867:   VecCUDARestoreArrayRead(bb,&barray);
4868:   VecCUDARestoreArrayWrite(xx,&xarray);
4869:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
4870:   PetscLogGpuTimeEnd();
4871:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
4872:   return(0);
4873: }