Actual source code: aijcusparse.cu

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX
  7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  9: #include <petscconf.h>
 10: #include <../src/mat/impls/aij/seq/aij.h>
 11: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 12: #include <../src/vec/vec/impls/dvecimpl.h>
 13: #include <petsc/private/vecimpl.h>
 14: #undef VecType
 15: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

 17: const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
 18: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 19:   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we copy them in
 20:     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command-line options for them.

 22:   typedef enum {
 23:       CUSPARSE_MV_ALG_DEFAULT = 0,
 24:       CUSPARSE_COOMV_ALG      = 1,
 25:       CUSPARSE_CSRMV_ALG1     = 2,
 26:       CUSPARSE_CSRMV_ALG2     = 3
 27:   } cusparseSpMVAlg_t;

 29:   typedef enum {
 30:       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
 31:       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
 32:       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
 33:       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
 34:       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
 35:       CUSPARSE_SPMM_ALG_DEFAULT = 0,
 36:       CUSPARSE_SPMM_COO_ALG1    = 1,
 37:       CUSPARSE_SPMM_COO_ALG2    = 2,
 38:       CUSPARSE_SPMM_COO_ALG3    = 3,
 39:       CUSPARSE_SPMM_COO_ALG4    = 5,
 40:       CUSPARSE_SPMM_CSR_ALG1    = 4,
 41:       CUSPARSE_SPMM_CSR_ALG2    = 6,
 42:   } cusparseSpMMAlg_t;

 44:   typedef enum {
 45:       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
 46:       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
 47:   } cusparseCsr2CscAlg_t;
 48:   */
 49:   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
 50:   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
 51:   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
 52: #endif

 54: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 55: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 56: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 58: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 59: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 60: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 62: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 63: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 64: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 65: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 66: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 67: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 68: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 69: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 70: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 71: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 72: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 73: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

 75: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 76: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 77: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 78: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
 79: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 80: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 82: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 83: {
 84:   cusparseStatus_t   stat;
 85:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 88:   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
 89:   cusparsestruct->stream = stream;
 90:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
 91:   return(0);
 92: }
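
/* A minimal usage sketch (assumptions: A exists and has type MATSEQAIJCUSPARSE,
   and cerr is a cudaError_t): queue this matrix's cuSPARSE work on a caller-owned
   CUDA stream.

     cudaStream_t stream;
     cerr = cudaStreamCreate(&stream);CHKERRCUDA(cerr);
     MatCUSPARSESetStream(A,stream);
*/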

 94: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
 95: {
 96:   cusparseStatus_t   stat;
 97:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

100:   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
101:   if (cusparsestruct->handle != handle) {
102:     if (cusparsestruct->handle) {
103:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
104:     }
105:     cusparsestruct->handle = handle;
106:   }
107:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
108:   return(0);
109: }

111: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
112: {
113:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

116:   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
117:   if (cusparsestruct->handle) cusparsestruct->handle = 0;
118:   return(0);
119: }
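
/* Note the asymmetry with MatCUSPARSESetHandle() above: MatCUSPARSEClearHandle()
   only drops the reference and never calls cusparseDestroy(), so a handle installed
   by the caller remains owned (and must eventually be destroyed) by the caller. */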

121: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
122: {
124:   *type = MATSOLVERCUSPARSE;
125:   return(0);
126: }

128: /*MC
129:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
130:   on a single GPU of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. Currently supported
131:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
132:   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
133:   CUSPARSE triangular solve algorithm. However, the performance can be quite poor, and thus these
134:   algorithms are not recommended. This class does NOT support direct solver operations.

136:   Level: beginner

138: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
139: M*/
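
/* A usage sketch for MATSOLVERCUSPARSE (assumes a KSP/PC configured elsewhere).
   From the options database:

     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   or programmatically:

     PCSetType(pc,PCILU);
     PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);
*/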

141: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
142: {
144:   PetscInt       n = A->rmap->n;

147:   MatCreate(PetscObjectComm((PetscObject)A),B);
148:   MatSetSizes(*B,n,n,n,n);
149:   (*B)->factortype = ftype;
150:   (*B)->useordering = PETSC_TRUE;
151:   MatSetType(*B,MATSEQAIJCUSPARSE);

153:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
154:     MatSetBlockSizesFromMats(*B,A,A);
155:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
156:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
157:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
158:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
159:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
160:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

162:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
163:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
164:   return(0);
165: }

167: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
168: {
169:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

172:   switch (op) {
173:   case MAT_CUSPARSE_MULT:
174:     cusparsestruct->format = format;
175:     break;
176:   case MAT_CUSPARSE_ALL:
177:     cusparsestruct->format = format;
178:     break;
179:   default:
180:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
181:   }
182:   return(0);
183: }

185: /*@
186:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
187:    operation. Only the MatMult operation can use different GPU storage formats
188:    for MPIAIJCUSPARSE matrices.
189:    Not Collective

191:    Input Parameters:
192: +  A - Matrix of type SEQAIJCUSPARSE
193: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
194: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, or MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)


198:    Level: intermediate

200: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
201: @*/
202: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
203: {

208:   PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
209:   return(0);
210: }
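
/* Example sketch: use the ELL kernel for MatMult only, keeping CSR elsewhere
   (assumes A is a SEQAIJCUSPARSE matrix):

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);

   The options-database equivalent is -mat_cusparse_mult_storage_format ell,
   parsed in MatSetFromOptions_SeqAIJCUSPARSE() below. */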

212: /*@
213:    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

215:    Collective on mat

217:    Input Parameters:
218: +  A - Matrix of type SEQAIJCUSPARSE
219: -  transgen - the boolean flag

221:    Level: intermediate

223: .seealso: MATSEQAIJCUSPARSE
224: @*/
225: PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
226: {
228:   PetscBool      flg;

232:   PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);
233:   if (flg) {
234:     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

236:     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
237:     cusp->transgen = transgen;
238:     if (!transgen) { /* need to destroy the transpose matrix if present, to prevent logic errors if transgen is set to true later */
239:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
240:     }
241:   }
242:   return(0);
243: }
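
/* Sketch: request an explicit transpose before repeated MatMultTranspose() calls
   (assumes A is MATSEQAIJCUSPARSE and not factored):

     MatSeqAIJCUSPARSESetGenerateTranspose(A,PETSC_TRUE);

   The same flag is reachable via -mat_cusparse_transgen, handled below. */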

245: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
246: {
247:   PetscErrorCode           ierr;
248:   MatCUSPARSEStorageFormat format;
249:   PetscBool                flg;
250:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

253:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
254:   if (A->factortype == MAT_FACTOR_NONE) {
255:     PetscBool transgen = cusparsestruct->transgen;

257:     PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);
258:     if (flg) {MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);}

260:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
261:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
262:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);}

264:     PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
265:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
266:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);}
267:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
268:     cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
269:     PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
270:                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
271:     /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
272:     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

274:     cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default; this algorithm only supports a column-major dense matrix B */
275:     PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
276:                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
277:     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

279:     cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
280:     PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
281:                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
282:     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
283:    #endif
284:   }
285:   PetscOptionsTail();
286:   return(0);
287: }
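
/* For reference, the options database keys handled above; the last three exist only
   when PETSc is built against CUDA 11 or later:

     -mat_cusparse_transgen <bool>                  explicit transpose for MatMultTranspose
     -mat_cusparse_mult_storage_format csr|ell|hyb  storage for SpMV
     -mat_cusparse_storage_format csr|ell|hyb       storage for SpMV and TriSolve
     -mat_cusparse_spmv_alg, -mat_cusparse_spmm_alg, -mat_cusparse_csr2csc_alg
                                                    cuSPARSE algorithm selection
*/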

289: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
290: {

294:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
295:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
296:   return(0);
297: }

299: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
300: {

304:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
305:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
306:   return(0);
307: }

309: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
310: {

314:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
315:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
316:   return(0);
317: }

319: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
320: {

324:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
325:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
326:   return(0);
327: }
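
/* The four symbolic wrappers above share one pattern: the symbolic factorization is
   performed entirely on the CPU by the SeqAIJ implementation, and only the numeric
   function pointer is redirected to the CUSPARSE variant. The GPU triangular-factor
   structures are built later, inside the numeric factorization. */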

329: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
330: {
331:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
332:   PetscInt                          n = A->rmap->n;
333:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
334:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
335:   cusparseStatus_t                  stat;
336:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
337:   const MatScalar                   *aa = a->a,*v;
338:   PetscInt                          *AiLo, *AjLo;
339:   PetscScalar                       *AALo;
340:   PetscInt                          i,nz, nzLower, offset, rowOffset;
341:   PetscErrorCode                    ierr;
342:   cudaError_t                       cerr;

345:   if (!n) return(0);
346:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
347:     try {
348:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
349:       nzLower=n+ai[n]-ai[1];

351:       /* Allocate Space for the lower triangular matrix */
352:       cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
353:       cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
354:       cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

356:       /* Fill the lower triangular matrix */
357:       AiLo[0]  = (PetscInt) 0;
358:       AiLo[n]  = nzLower;
359:       AjLo[0]  = (PetscInt) 0;
360:       AALo[0]  = (MatScalar) 1.0;
361:       v        = aa;
362:       vi       = aj;
363:       offset   = 1;
364:       rowOffset= 1;
365:       for (i=1; i<n; i++) {
366:         nz = ai[i+1] - ai[i];
367:         /* additional 1 for the term on the diagonal */
368:         AiLo[i]    = rowOffset;
369:         rowOffset += nz+1;

371:         PetscArraycpy(&(AjLo[offset]), vi, nz);
372:         PetscArraycpy(&(AALo[offset]), v, nz);

374:         offset      += nz;
375:         AjLo[offset] = (PetscInt) i;
376:         AALo[offset] = (MatScalar) 1.0;
377:         offset      += 1;

379:         v  += nz;
380:         vi += nz;
381:       }

383:       /* allocate space for the triangular factor information */
384:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

386:       /* Create the matrix description */
387:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
388:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
389:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
390:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
391:      #else
392:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
393:      #endif
394:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
395:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

397:       /* set the operation */
398:       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

400:       /* set the matrix */
401:       loTriFactor->csrMat = new CsrMatrix;
402:       loTriFactor->csrMat->num_rows = n;
403:       loTriFactor->csrMat->num_cols = n;
404:       loTriFactor->csrMat->num_entries = nzLower;

406:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
407:       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

409:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
410:       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

412:       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
413:       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

415:       /* Create the solve analysis information */
416:       stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
417:     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
418:       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
419:                                      loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
420:                                      loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
421:                                      loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
422:                                      &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
423:       cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
424:     #endif

426:       /* perform the solve analysis */
427:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
428:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
429:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
430:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
431:                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
432:                                ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
433:                              #endif
434: );CHKERRCUSPARSE(stat);

436:       /* assign the pointer. Is this really necessary? */
437:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

439:       cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
440:       cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
441:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
442:       PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
443:     } catch(char *ex) {
444:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
445:     }
446:   }
447:   return(0);
448: }
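
/* Summary of the routine above: the factored SeqAIJ matrix keeps the strictly lower
   triangular part of L in ai/aj/aa, so a unit diagonal is appended to every row while
   copying, giving nzLower = n + ai[n] - ai[1] entries. The staging arrays come from
   cudaMallocHost() (page-locked memory), which speeds up the subsequent host-to-device
   copies into the thrust containers. */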

450: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
451: {
452:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
453:   PetscInt                          n = A->rmap->n;
454:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
455:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
456:   cusparseStatus_t                  stat;
457:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
458:   const MatScalar                   *aa = a->a,*v;
459:   PetscInt                          *AiUp, *AjUp;
460:   PetscScalar                       *AAUp;
461:   PetscInt                          i,nz, nzUpper, offset;
462:   PetscErrorCode                    ierr;
463:   cudaError_t                       cerr;

466:   if (!n) return(0);
467:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
468:     try {
469:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
470:       nzUpper = adiag[0]-adiag[n];

472:       /* Allocate Space for the upper triangular matrix */
473:       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
474:       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
475:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

477:       /* Fill the upper triangular matrix */
478:       AiUp[0]=(PetscInt) 0;
479:       AiUp[n]=nzUpper;
480:       offset = nzUpper;
481:       for (i=n-1; i>=0; i--) {
482:         v  = aa + adiag[i+1] + 1;
483:         vi = aj + adiag[i+1] + 1;

485:         /* number of elements NOT on the diagonal */
486:         nz = adiag[i] - adiag[i+1]-1;

488:         /* decrement the offset */
489:         offset -= (nz+1);

491:         /* first, set the diagonal elements */
492:         AjUp[offset] = (PetscInt) i;
493:         AAUp[offset] = (MatScalar)1./v[nz];
494:         AiUp[i]      = AiUp[i+1] - (nz+1);

496:         PetscArraycpy(&(AjUp[offset+1]), vi, nz);
497:         PetscArraycpy(&(AAUp[offset+1]), v, nz);
498:       }

500:       /* allocate space for the triangular factor information */
501:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

503:       /* Create the matrix description */
504:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
505:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
506:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
507:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
508:      #else
509:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
510:      #endif
511:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
512:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

514:       /* set the operation */
515:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

517:       /* set the matrix */
518:       upTriFactor->csrMat = new CsrMatrix;
519:       upTriFactor->csrMat->num_rows = n;
520:       upTriFactor->csrMat->num_cols = n;
521:       upTriFactor->csrMat->num_entries = nzUpper;

523:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
524:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

526:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
527:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

529:       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
530:       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

532:       /* Create the solve analysis information */
533:       stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
534:     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
535:       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
536:                                    upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
537:                                    upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
538:                                    upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
539:                                    &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
540:       cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
541:     #endif

543:       /* perform the solve analysis */
544:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
545:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
546:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
547:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
548:                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
549:                                ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
550:                              #endif
551: );CHKERRCUSPARSE(stat);

553:       /* assign the pointer. Is this really necessary? */
554:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

556:       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
557:       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
558:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
559:       PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
560:     } catch(char *ex) {
561:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
562:     }
563:   }
564:   return(0);
565: }
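
/* The upper factor is assembled by walking the rows backwards through adiag, which
   delimits the U part of the factored SeqAIJ storage. Note AAUp[offset] = 1./v[nz]:
   the CPU factorization stores the reciprocal of each pivot, so inverting it recovers
   the actual diagonal entry needed by this NON_UNIT triangular solve. */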

567: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
568: {
569:   PetscErrorCode               ierr;
570:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
571:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
572:   IS                           isrow = a->row,iscol = a->icol;
573:   PetscBool                    row_identity,col_identity;
574:   const PetscInt               *r,*c;
575:   PetscInt                     n = A->rmap->n;

578:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
579:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
580:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

582:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
583:   cusparseTriFactors->nnz=a->nz;

585:   A->offloadmask = PETSC_OFFLOAD_BOTH;
586:   /* lower triangular indices */
587:   ISGetIndices(isrow,&r);
588:   ISIdentity(isrow,&row_identity);
589:   if (!row_identity) {
590:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
591:     cusparseTriFactors->rpermIndices->assign(r, r+n);
592:   }
593:   ISRestoreIndices(isrow,&r);

595:   /* upper triangular indices */
596:   ISGetIndices(iscol,&c);
597:   ISIdentity(iscol,&col_identity);
598:   if (!col_identity) {
599:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
600:     cusparseTriFactors->cpermIndices->assign(c, c+n);
601:   }

603:   if (!row_identity && !col_identity) {
604:     PetscLogCpuToGpu(2*n*sizeof(PetscInt));
605:   } else if (!row_identity) {
606:     PetscLogCpuToGpu(n*sizeof(PetscInt));
607:   } else if (!col_identity) {
608:     PetscLogCpuToGpu(n*sizeof(PetscInt));
609:   }

611:   ISRestoreIndices(iscol,&c);
612:   return(0);
613: }
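
/* At this point both ILU factors live on the GPU. The row permutation (rpermIndices)
   and column permutation (cpermIndices) are uploaded only when they differ from the
   identity; MatSolve_SeqAIJCUSPARSE uses them to permute the right-hand side and the
   solution around the two triangular solves. */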

615: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
616: {
617:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
618:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
619:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
620:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
621:   cusparseStatus_t                  stat;
622:   PetscErrorCode                    ierr;
623:   cudaError_t                       cerr;
624:   PetscInt                          *AiUp, *AjUp;
625:   PetscScalar                       *AAUp;
626:   PetscScalar                       *AALo;
627:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
628:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
629:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
630:   const MatScalar                   *aa = b->a,*v;

633:   if (!n) return(0);
634:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
635:     try {
636:       /* Allocate Space for the upper triangular matrix */
637:       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
638:       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
639:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
640:       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

642:       /* Fill the upper triangular matrix */
643:       AiUp[0]=(PetscInt) 0;
644:       AiUp[n]=nzUpper;
645:       offset = 0;
646:       for (i=0; i<n; i++) {
647:         /* set the pointers */
648:         v  = aa + ai[i];
649:         vj = aj + ai[i];
650:         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

652:         /* first, set the diagonal elements */
653:         AjUp[offset] = (PetscInt) i;
654:         AAUp[offset] = (MatScalar)1.0/v[nz];
655:         AiUp[i]      = offset;
656:         AALo[offset] = (MatScalar)1.0/v[nz];

658:         offset+=1;
659:         if (nz>0) {
660:           PetscArraycpy(&(AjUp[offset]), vj, nz);
661:           PetscArraycpy(&(AAUp[offset]), v, nz);
662:           for (j=offset; j<offset+nz; j++) {
663:             AAUp[j] = -AAUp[j];
664:             AALo[j] = AAUp[j]/v[nz];
665:           }
666:           offset+=nz;
667:         }
668:       }

670:       /* allocate space for the triangular factor information */
671:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

673:       /* Create the matrix description */
674:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
675:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
676:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
677:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
678:      #else
679:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
680:      #endif
681:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
682:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

684:       /* set the matrix */
685:       upTriFactor->csrMat = new CsrMatrix;
686:       upTriFactor->csrMat->num_rows = A->rmap->n;
687:       upTriFactor->csrMat->num_cols = A->cmap->n;
688:       upTriFactor->csrMat->num_entries = a->nz;

690:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
691:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

693:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
694:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

696:       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
697:       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

699:       /* set the operation */
700:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

702:       /* Create the solve analysis information */
703:       stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
704:     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
705:       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
706:                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
707:                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
708:                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
709:                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
710:       cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
711:     #endif

713:       /* perform the solve analysis */
714:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
715:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
716:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
717:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
718:                               #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
719:                                ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
720:                               #endif
721: );CHKERRCUSPARSE(stat);

723:       /* assign the pointer. Is this really necessary? */
724:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

726:       /* allocate space for the triangular factor information */
727:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

729:       /* Create the matrix description */
730:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
731:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
732:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
733:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
734:      #else
735:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
736:      #endif
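      /* FILL_MODE_UPPER is intentional for this "lower" factor: it reuses the upper
         triangular pattern built above and is solved with CUSPARSE_OPERATION_TRANSPOSE */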
737:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
738:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

740:       /* set the operation */
741:       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

743:       /* set the matrix */
744:       loTriFactor->csrMat = new CsrMatrix;
745:       loTriFactor->csrMat->num_rows = A->rmap->n;
746:       loTriFactor->csrMat->num_cols = A->cmap->n;
747:       loTriFactor->csrMat->num_entries = a->nz;

749:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
750:       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

752:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
753:       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

755:       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
756:       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
757:       PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));

759:       /* Create the solve analysis information */
760:       stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
761:     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
762:       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
763:                                      loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
764:                                      loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
765:                                      loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
766:                                      &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
767:       cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
768:     #endif

770:       /* perform the solve analysis */
771:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
772:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
773:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
774:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
775:                               #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
776:                                ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
777:                               #endif
778: );CHKERRCUSPARSE(stat);

780:       /* assign the pointer. Is this really necessary? */
781:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

783:       A->offloadmask = PETSC_OFFLOAD_BOTH;
784:       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
785:       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
786:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
787:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
788:     } catch(char *ex) {
789:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
790:     }
791:   }
792:   return(0);
793: }
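
/* For ICC a single sparsity pattern (AiUp/AjUp, the upper triangle) serves both
   solves: the upper solve uses AAUp with a unit diagonal, while the "lower" solve
   reuses the same pattern with the rescaled values AALo and a transposed operation,
   avoiding the storage of an explicit lower triangular matrix. */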

795: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
796: {
797:   PetscErrorCode               ierr;
798:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
799:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
800:   IS                           ip = a->row;
801:   const PetscInt               *rip;
802:   PetscBool                    perm_identity;
803:   PetscInt                     n = A->rmap->n;

806:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
807:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
808:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
809:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;

811:   /* lower triangular indices */
812:   ISGetIndices(ip,&rip);
813:   ISIdentity(ip,&perm_identity);
814:   if (!perm_identity) {
815:     IS             iip;
816:     const PetscInt *irip;

818:     ISInvertPermutation(ip,PETSC_DECIDE,&iip);
819:     ISGetIndices(iip,&irip);
820:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
821:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
822:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
823:     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
824:     ISRestoreIndices(iip,&irip);
825:     ISDestroy(&iip);
826:     PetscLogCpuToGpu(2*n*sizeof(PetscInt));
827:   }
828:   ISRestoreIndices(ip,&rip);
829:   return(0);
830: }

832: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
833: {
834:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
835:   IS             isrow = b->row,iscol = b->col;
836:   PetscBool      row_identity,col_identity;

840:   MatLUFactorNumeric_SeqAIJ(B,A,info);
841:   B->offloadmask = PETSC_OFFLOAD_CPU;
842:   /* determine which version of MatSolve needs to be used. */
843:   ISIdentity(isrow,&row_identity);
844:   ISIdentity(iscol,&col_identity);
845:   if (row_identity && col_identity) {
846:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
847:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
848:     B->ops->matsolve = NULL;
849:     B->ops->matsolvetranspose = NULL;
850:   } else {
851:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
852:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
853:     B->ops->matsolve = NULL;
854:     B->ops->matsolvetranspose = NULL;
855:   }

857:   /* get the triangular factors */
858:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
859:   return(0);
860: }
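
/* When both permutations are the identity, the NaturalOrdering solve variants are
   installed; they skip the rpermIndices/cpermIndices gathers that the general
   MatSolve_SeqAIJCUSPARSE performs around the triangular solves. */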

862: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
863: {
864:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
865:   IS             ip = b->row;
866:   PetscBool      perm_identity;

870:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
871:   B->offloadmask = PETSC_OFFLOAD_CPU;
872:   /* determine which version of MatSolve needs to be used. */
873:   ISIdentity(ip,&perm_identity);
874:   if (perm_identity) {
875:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
876:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
877:     B->ops->matsolve = NULL;
878:     B->ops->matsolvetranspose = NULL;
879:   } else {
880:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
881:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
882:     B->ops->matsolve = NULL;
883:     B->ops->matsolvetranspose = NULL;
884:   }

886:   /* get the triangular factors */
887:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
888:   return(0);
889: }

891: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
892: {
893:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
894:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
895:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
896:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
897:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
898:   cusparseStatus_t                  stat;
899:   cusparseIndexBase_t               indexBase;
900:   cusparseMatrixType_t              matrixType;
901:   cusparseFillMode_t                fillMode;
902:   cusparseDiagType_t                diagType;


906:   /*********************************************/
907:   /* Now the Transpose of the Lower Tri Factor */
908:   /*********************************************/

910:   /* allocate space for the transpose of the lower triangular factor */
911:   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

913:   /* set the matrix descriptors of the lower triangular factor */
914:   matrixType = cusparseGetMatType(loTriFactor->descr);
915:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
916:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
917:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
918:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

920:   /* Create the matrix description */
921:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
922:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
923:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
924:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
925:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

927:   /* set the operation */
928:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

930:   /* allocate GPU space for the CSC of the lower triangular factor */
931:   loTriFactorT->csrMat = new CsrMatrix;
932:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
933:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
934:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
935:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
936:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
937:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

939:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
940: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
941:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
942:                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
943:                                        loTriFactor->csrMat->values->data().get(),
944:                                        loTriFactor->csrMat->row_offsets->data().get(),
945:                                        loTriFactor->csrMat->column_indices->data().get(),
946:                                        loTriFactorT->csrMat->values->data().get(),
947:                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
948:                                        CUSPARSE_ACTION_NUMERIC,indexBase,
949:                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
950:   cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
951: #endif

953:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
954:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
955:                           loTriFactor->csrMat->values->data().get(),
956:                           loTriFactor->csrMat->row_offsets->data().get(),
957:                           loTriFactor->csrMat->column_indices->data().get(),
958:                           loTriFactorT->csrMat->values->data().get(),
959:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
960:                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
961:                           CUSPARSE_ACTION_NUMERIC, indexBase,
962:                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
963:                         #else
964:                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
965:                           CUSPARSE_ACTION_NUMERIC, indexBase
966:                         #endif
967: );CHKERRCUSPARSE(stat);

969:   /* Create the solve analysis information */
970:   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
971: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
972:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
973:                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
974:                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
975:                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
976:                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
977:   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
978: #endif

980:   /* perform the solve analysis */
981:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
982:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
983:                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
984:                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
985:                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
986:                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
987:                           #endif
988: );CHKERRCUSPARSE(stat);

990:   /* assign the pointer. Is this really necessary? */
991:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

993:   /*********************************************/
994:   /* Now the Transpose of the Upper Tri Factor */
995:   /*********************************************/

997:   /* allocate space for the transpose of the upper triangular factor */
998:   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

1000:   /* set the matrix descriptors of the upper triangular factor */
1001:   matrixType = cusparseGetMatType(upTriFactor->descr);
1002:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1003:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1004:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1005:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

1007:   /* Create the matrix description */
1008:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1009:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1010:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1011:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1012:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

1014:   /* set the operation */
1015:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1017:   /* allocate GPU space for the CSC of the upper triangular factor */
1018:   upTriFactorT->csrMat = new CsrMatrix;
1019:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1020:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1021:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1022:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1023:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1024:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

1026:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1027: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1028:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1029:                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1030:                                 upTriFactor->csrMat->values->data().get(),
1031:                                 upTriFactor->csrMat->row_offsets->data().get(),
1032:                                 upTriFactor->csrMat->column_indices->data().get(),
1033:                                 upTriFactorT->csrMat->values->data().get(),
1034:                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1035:                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1036:                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1037:   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1038: #endif

1040:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1041:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1042:                           upTriFactor->csrMat->values->data().get(),
1043:                           upTriFactor->csrMat->row_offsets->data().get(),
1044:                           upTriFactor->csrMat->column_indices->data().get(),
1045:                           upTriFactorT->csrMat->values->data().get(),
1046:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1047:                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1048:                           CUSPARSE_ACTION_NUMERIC, indexBase,
1049:                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1050:                         #else
1051:                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1052:                           CUSPARSE_ACTION_NUMERIC, indexBase
1053:                         #endif
1054: );CHKERRCUSPARSE(stat);

1056:   /* Create the solve analysis information */
1057:   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1058:   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1059:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1060:                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1061:                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1062:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1063:                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1064:   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1065:   #endif

1067:   /* perform the solve analysis */
1068:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1069:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1070:                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1071:                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1072:                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1073:                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1074:                           #endif
1075: );CHKERRCUSPARSE(stat);

1077:   /* assign the pointer. Is this really necessary? */
1078:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1079:   return(0);
1080: }
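
/* Both transposed factors are materialized as explicit CSC copies (via csr2csc) with
   solveOp set to NON_TRANSPOSE, so MatSolveTranspose can run ordinary triangular
   solves on the transposed structures, trading extra GPU memory for faster repeated
   transpose solves. */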

1082: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1083: {
1084:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1085:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1086:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1087:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1088:   cusparseStatus_t             stat;
1089:   cusparseIndexBase_t          indexBase;
1090:   cudaError_t                  err;
1091:   PetscErrorCode               ierr;

1094:   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) return(0);
1095:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1096:   PetscLogGpuTimeBegin();
1097:   /* create cusparse matrix */
1098:   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1099:   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1100:   indexBase = cusparseGetMatIndexBase(matstruct->descr);
1101:   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1102:   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1104:   /* set alpha and beta */
1105:   err = cudaMalloc((void **)&(matstructT->alpha_one),    sizeof(PetscScalar));CHKERRCUDA(err);
1106:   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1107:   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1108:   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1109:   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1110:   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1111:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

1113:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1114:     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1115:     CsrMatrix *matrixT= new CsrMatrix;
1116:     matrixT->num_rows = A->cmap->n;
1117:     matrixT->num_cols = A->rmap->n;
1118:     matrixT->num_entries = a->nz;
1119:     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1120:     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1121:     matrixT->values = new THRUSTARRAY(a->nz);

1123:     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
1124:     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

1126:     /* compute the transpose, i.e. the CSC */
1127:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1128:     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1129:                                   A->cmap->n, matrix->num_entries,
1130:                                   matrix->values->data().get(),
1131:                                   cusparsestruct->rowoffsets_gpu->data().get(),
1132:                                   matrix->column_indices->data().get(),
1133:                                   matrixT->values->data().get(),
1134:                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1135:                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1136:                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1137:     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1138:    #endif

1140:     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1141:                             A->cmap->n, matrix->num_entries,
1142:                             matrix->values->data().get(),
1143:                             cusparsestruct->rowoffsets_gpu->data().get(),
1144:                             matrix->column_indices->data().get(),
1145:                             matrixT->values->data().get(),
1146:                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1147:                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1148:                             CUSPARSE_ACTION_NUMERIC,indexBase,
1149:                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1150:                           #else
1151:                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1152:                             CUSPARSE_ACTION_NUMERIC, indexBase
1153:                           #endif
1154: );CHKERRCUSPARSE(stat);
1155:     matstructT->mat = matrixT;

1157:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1158:     stat = cusparseCreateCsr(&matstructT->matDescr,
1159:                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1160:                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1161:                              matrixT->values->data().get(),
1162:                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1163:                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1164:    #endif
1165:   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1166:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1167:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1168:    #else
1169:     CsrMatrix *temp  = new CsrMatrix;
1170:     CsrMatrix *tempT = new CsrMatrix;
1171:     /* First convert HYB to CSR */
1172:     temp->num_rows = A->rmap->n;
1173:     temp->num_cols = A->cmap->n;
1174:     temp->num_entries = a->nz;
1175:     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1176:     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1177:     temp->values = new THRUSTARRAY(a->nz);

1179:     stat = cusparse_hyb2csr(cusparsestruct->handle,
1180:                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1181:                             temp->values->data().get(),
1182:                             temp->row_offsets->data().get(),
1183:                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);

1185:     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1186:     tempT->num_rows = A->rmap->n;
1187:     tempT->num_cols = A->cmap->n;
1188:     tempT->num_entries = a->nz;
1189:     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1190:     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1191:     tempT->values = new THRUSTARRAY(a->nz);

1193:     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1194:                             temp->num_cols, temp->num_entries,
1195:                             temp->values->data().get(),
1196:                             temp->row_offsets->data().get(),
1197:                             temp->column_indices->data().get(),
1198:                             tempT->values->data().get(),
1199:                             tempT->column_indices->data().get(),
1200:                             tempT->row_offsets->data().get(),
1201:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

1203:     /* Last, convert CSC to HYB */
1204:     cusparseHybMat_t hybMat;
1205:     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1206:     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1207:       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1208:     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1209:                             matstructT->descr, tempT->values->data().get(),
1210:                             tempT->row_offsets->data().get(),
1211:                             tempT->column_indices->data().get(),
1212:                             hybMat, 0, partition);CHKERRCUSPARSE(stat);

1214:     /* assign the pointer */
1215:     matstructT->mat = hybMat;
1216:     /* delete temporaries */
1217:     if (tempT) {
1218:       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1219:       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1220:       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1221:       delete (CsrMatrix*) tempT;
1222:     }
1223:     if (temp) {
1224:       if (temp->values) delete (THRUSTARRAY*) temp->values;
1225:       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1226:       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1227:       delete (CsrMatrix*) temp;
1228:     }
1229:    #endif
1230:   }
1231:   err  = WaitForCUDA();CHKERRCUDA(err);
1232:   PetscLogGpuTimeEnd();
1233:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1234:   /* the compressed row indices are not used for matTranspose */
1235:   matstructT->cprowIndices = NULL;
1236:   /* assign the pointer */
1237:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1238:   return(0);
1239: }
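/* Why a csr2csc conversion produces the transpose: the CSC arrays of A, read
   verbatim as CSR arrays, describe A^T. A tiny worked example (hypothetical
   values):
     A = [1 0 2]
         [0 3 0]        (2 x 3, nnz = 3)
     CSR(A): rowptr = {0,2,3}, colind = {0,2,1}, val = {1,2,3}
     CSC(A): colptr = {0,1,2,3}, rowind = {0,1,0}, val = {1,3,2}
   Reading (colptr, rowind, val) as CSR yields the 3 x 2 matrix
     A^T = [1 0]
           [0 3]
           [2 0]
   which is why matrixT above can be handed directly to cusparseCreateCsr(). */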

1241: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1242: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1243: {
1244:   PetscInt                              n = xx->map->n;
1245:   const PetscScalar                     *barray;
1246:   PetscScalar                           *xarray;
1247:   thrust::device_ptr<const PetscScalar> bGPU;
1248:   thrust::device_ptr<PetscScalar>       xGPU;
1249:   cusparseStatus_t                      stat;
1250:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1251:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1252:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1253:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1254:   PetscErrorCode                        ierr;
1255:   cudaError_t                           cerr;

1258:   /* Analyze the matrix and create the transpose ... on the fly */
1259:   if (!loTriFactorT && !upTriFactorT) {
1260:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1261:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1262:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1263:   }

1265:   /* Get the GPU pointers */
1266:   VecCUDAGetArrayWrite(xx,&xarray);
1267:   VecCUDAGetArrayRead(bb,&barray);
1268:   xGPU = thrust::device_pointer_cast(xarray);
1269:   bGPU = thrust::device_pointer_cast(barray);

1271:   PetscLogGpuTimeBegin();
1272:   /* First, reorder with the row permutation */
1273:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1274:                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1275:                xGPU);

1277:   /* Next, solve U */
1278:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1279:                         upTriFactorT->csrMat->num_rows,
1280:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1281:                         upTriFactorT->csrMat->num_entries,
1282:                       #endif
1283:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1284:                         upTriFactorT->csrMat->values->data().get(),
1285:                         upTriFactorT->csrMat->row_offsets->data().get(),
1286:                         upTriFactorT->csrMat->column_indices->data().get(),
1287:                         upTriFactorT->solveInfo,
1288:                         xarray, tempGPU->data().get()
1289:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1290:                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1291:                       #endif
1292: );CHKERRCUSPARSE(stat);

1294:   /* Then, solve L */
1295:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1296:                         loTriFactorT->csrMat->num_rows,
1297:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1298:                         loTriFactorT->csrMat->num_entries,
1299:                       #endif
1300:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1301:                         loTriFactorT->csrMat->values->data().get(),
1302:                         loTriFactorT->csrMat->row_offsets->data().get(),
1303:                         loTriFactorT->csrMat->column_indices->data().get(),
1304:                         loTriFactorT->solveInfo,
1305:                         tempGPU->data().get(), xarray
1306:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1307:                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1308:                       #endif
1309: );CHKERRCUSPARSE(stat);

1311:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1312:   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1313:                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1314:                tempGPU->begin());

1316:   /* Copy the temporary to the full solution. */
1317:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

1319:   /* restore */
1320:   VecCUDARestoreArrayRead(bb,&barray);
1321:   VecCUDARestoreArrayWrite(xx,&xarray);
1322:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1323:   PetscLogGpuTimeEnd();
1324:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1325:   return(0);
1326: }
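/* A minimal sketch (not PETSc source; all names illustrative) of the gather
   performed by the permutation steps above: thrust::copy over a
   permutation_iterator computes x[i] = b[perm[i]] entirely on the device. */
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/permutation_iterator.h>
static void PermutationGatherSketch(void)
{
  thrust::device_vector<double> b(4), x(4);
  thrust::device_vector<int>    perm(4);
  b[0] = 10; b[1] = 20; b[2] = 30; b[3] = 40;         /* right-hand side   */
  perm[0] = 2; perm[1] = 0; perm[2] = 3; perm[3] = 1; /* row permutation   */
  thrust::copy(thrust::make_permutation_iterator(b.begin(), perm.begin()),
               thrust::make_permutation_iterator(b.begin(), perm.end()),
               x.begin());
  /* x now holds {30, 10, 40, 20}, i.e. x = P b */
}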

1328: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1329: {
1330:   const PetscScalar                 *barray;
1331:   PetscScalar                       *xarray;
1332:   cusparseStatus_t                  stat;
1333:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1334:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1335:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1336:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1337:   PetscErrorCode                    ierr;
1338:   cudaError_t                       cerr;

1341:   /* Analyze the matrix and create the transpose ... on the fly */
1342:   if (!loTriFactorT && !upTriFactorT) {
1343:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1344:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1345:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1346:   }

1348:   /* Get the GPU pointers */
1349:   VecCUDAGetArrayWrite(xx,&xarray);
1350:   VecCUDAGetArrayRead(bb,&barray);

1352:   PetscLogGpuTimeBegin();
1353:   /* First, solve U */
1354:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1355:                         upTriFactorT->csrMat->num_rows,
1356:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1357:                         upTriFactorT->csrMat->num_entries,
1358:                       #endif
1359:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1360:                         upTriFactorT->csrMat->values->data().get(),
1361:                         upTriFactorT->csrMat->row_offsets->data().get(),
1362:                         upTriFactorT->csrMat->column_indices->data().get(),
1363:                         upTriFactorT->solveInfo,
1364:                         barray, tempGPU->data().get()
1365:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1366:                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1367:                       #endif
1368: );CHKERRCUSPARSE(stat);

1370:   /* Then, solve L */
1371:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1372:                         loTriFactorT->csrMat->num_rows,
1373:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1374:                         loTriFactorT->csrMat->num_entries,
1375:                       #endif
1376:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1377:                         loTriFactorT->csrMat->values->data().get(),
1378:                         loTriFactorT->csrMat->row_offsets->data().get(),
1379:                         loTriFactorT->csrMat->column_indices->data().get(),
1380:                         loTriFactorT->solveInfo,
1381:                         tempGPU->data().get(), xarray
1382:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1383:                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1384:                       #endif
1385: );CHKERRCUSPARSE(stat);

1387:   /* restore */
1388:   VecCUDARestoreArrayRead(bb,&barray);
1389:   VecCUDARestoreArrayWrite(xx,&xarray);
1390:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1391:   PetscLogGpuTimeEnd();
1392:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1393:   return(0);
1394: }

1396: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1397: {
1398:   const PetscScalar                     *barray;
1399:   PetscScalar                           *xarray;
1400:   thrust::device_ptr<const PetscScalar> bGPU;
1401:   thrust::device_ptr<PetscScalar>       xGPU;
1402:   cusparseStatus_t                      stat;
1403:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1404:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1405:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1406:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1407:   PetscErrorCode                        ierr;
1408:   cudaError_t                           cerr;


1412:   /* Get the GPU pointers */
1413:   VecCUDAGetArrayWrite(xx,&xarray);
1414:   VecCUDAGetArrayRead(bb,&barray);
1415:   xGPU = thrust::device_pointer_cast(xarray);
1416:   bGPU = thrust::device_pointer_cast(barray);

1418:   PetscLogGpuTimeBegin();
1419:   /* First, reorder with the row permutation */
1420:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1421:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1422:                tempGPU->begin());

1424:   /* Next, solve L */
1425:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1426:                         loTriFactor->csrMat->num_rows,
1427:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1428:                         loTriFactor->csrMat->num_entries,
1429:                       #endif
1430:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1431:                         loTriFactor->csrMat->values->data().get(),
1432:                         loTriFactor->csrMat->row_offsets->data().get(),
1433:                         loTriFactor->csrMat->column_indices->data().get(),
1434:                         loTriFactor->solveInfo,
1435:                         tempGPU->data().get(), xarray
1436:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1437:                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1438:                       #endif
1439: );CHKERRCUSPARSE(stat);

1441:   /* Then, solve U */
1442:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1443:                         upTriFactor->csrMat->num_rows,
1444:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1445:                         upTriFactor->csrMat->num_entries,
1446:                       #endif
1447:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1448:                         upTriFactor->csrMat->values->data().get(),
1449:                         upTriFactor->csrMat->row_offsets->data().get(),
1450:                         upTriFactor->csrMat->column_indices->data().get(),
1451:                         upTriFactor->solveInfo,
1452:                         xarray, tempGPU->data().get()
1453:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1454:                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1455:                       #endif
1456: );CHKERRCUSPARSE(stat);

1458:   /* Last, reorder with the column permutation */
1459:   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1460:                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1461:                xGPU);

1463:   VecCUDARestoreArrayRead(bb,&barray);
1464:   VecCUDARestoreArrayWrite(xx,&xarray);
1465:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1466:   PetscLogGpuTimeEnd();
1467:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1468:   return(0);
1469: }
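/* What the pair of cusparse_solve calls above computes, written as a dense
   host-side sketch (illustrative only; the real code substitutes on sparse
   CSR factors). L and U are row-major n x n triangular factors, t is scratch: */
static void LUSolveSketch(int n, const double *L, const double *U,
                          const double *b, double *t, double *x)
{
  for (int i = 0; i < n; i++) {      /* forward substitution: L t = b */
    double s = b[i];
    for (int j = 0; j < i; j++) s -= L[i*n+j]*t[j];
    t[i] = s/L[i*n+i];
  }
  for (int i = n-1; i >= 0; i--) {   /* backward substitution: U x = t */
    double s = t[i];
    for (int j = i+1; j < n; j++) s -= U[i*n+j]*x[j];
    x[i] = s/U[i*n+i];
  }
}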

1471: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1472: {
1473:   const PetscScalar                 *barray;
1474:   PetscScalar                       *xarray;
1475:   cusparseStatus_t                  stat;
1476:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1477:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1478:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1479:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1480:   PetscErrorCode                    ierr;
1481:   cudaError_t                       cerr;

1484:   /* Get the GPU pointers */
1485:   VecCUDAGetArrayWrite(xx,&xarray);
1486:   VecCUDAGetArrayRead(bb,&barray);

1488:   PetscLogGpuTimeBegin();
1489:   /* First, solve L */
1490:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1491:                         loTriFactor->csrMat->num_rows,
1492:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1493:                         loTriFactor->csrMat->num_entries,
1494:                       #endif
1495:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1496:                         loTriFactor->csrMat->values->data().get(),
1497:                         loTriFactor->csrMat->row_offsets->data().get(),
1498:                         loTriFactor->csrMat->column_indices->data().get(),
1499:                         loTriFactor->solveInfo,
1500:                         barray, tempGPU->data().get()
1501:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1502:                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1503:                       #endif
1504: );CHKERRCUSPARSE(stat);

1506:   /* Next, solve U */
1507:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1508:                         upTriFactor->csrMat->num_rows,
1509:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1510:                         upTriFactor->csrMat->num_entries,
1511:                       #endif
1512:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1513:                         upTriFactor->csrMat->values->data().get(),
1514:                         upTriFactor->csrMat->row_offsets->data().get(),
1515:                         upTriFactor->csrMat->column_indices->data().get(),
1516:                         upTriFactor->solveInfo,
1517:                         tempGPU->data().get(), xarray
1518:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1519:                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1520:                       #endif
1521: );CHKERRCUSPARSE(stat);

1523:   VecCUDARestoreArrayRead(bb,&barray);
1524:   VecCUDARestoreArrayWrite(xx,&xarray);
1525:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1526:   PetscLogGpuTimeEnd();
1527:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1528:   return(0);
1529: }

1531: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1532: {
1533:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1534:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1535:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1536:   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1537:   PetscErrorCode               ierr;
1538:   cusparseStatus_t             stat;
1539:   cudaError_t                  err;

1542:   if (A->boundtocpu) return(0);
1543:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1544:     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1545:       /* Copy values only */
1546:       CsrMatrix *matrix,*matrixT;
1547:       matrix = (CsrMatrix*)cusparsestruct->mat->mat;

1549:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1550:       matrix->values->assign(a->a, a->a+a->nz);
1551:       err  = WaitForCUDA();CHKERRCUDA(err);
1552:       PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1553:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);

1555:       /* Update matT if it has already been built */
1556:       if (cusparsestruct->matTranspose) {
1557:         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1558:         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1559:         PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1560:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1561:                             A->cmap->n, matrix->num_entries,
1562:                             matrix->values->data().get(),
1563:                             cusparsestruct->rowoffsets_gpu->data().get(),
1564:                             matrix->column_indices->data().get(),
1565:                             matrixT->values->data().get(),
1566:                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1567:                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1568:                             CUSPARSE_ACTION_NUMERIC,indexBase,
1569:                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1570:                           #else
1571:                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1572:                             CUSPARSE_ACTION_NUMERIC, indexBase
1573:                           #endif
1574: );CHKERRCUSPARSE(stat);
1575:         err  = WaitForCUDA();CHKERRCUDA(err);
1576:         PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1577:       }
1578:     } else {
1579:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1580:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1581:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);
1582:       delete cusparsestruct->workVector;
1583:       delete cusparsestruct->rowoffsets_gpu;
1584:       try {
1585:         if (a->compressedrow.use) {
1586:           m    = a->compressedrow.nrows;
1587:           ii   = a->compressedrow.i;
1588:           ridx = a->compressedrow.rindex;
1589:         } else {
1590:           m    = A->rmap->n;
1591:           ii   = a->i;
1592:           ridx = NULL;
1593:         }
1594:         cusparsestruct->nrows = m;

1596:         /* create cusparse matrix */
1597:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1598:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1599:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1600:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1602:         err = cudaMalloc((void **)&(matstruct->alpha_one),    sizeof(PetscScalar));CHKERRCUDA(err);
1603:         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1604:         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1605:         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1606:         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1607:         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1608:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

1610:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1611:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1612:           /* set the matrix */
1613:           CsrMatrix *mat= new CsrMatrix;
1614:           mat->num_rows = m;
1615:           mat->num_cols = A->cmap->n;
1616:           mat->num_entries = a->nz;
1617:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1618:           mat->row_offsets->assign(ii, ii + m+1);

1620:           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1621:           mat->column_indices->assign(a->j, a->j+a->nz);

1623:           mat->values = new THRUSTARRAY(a->nz);
1624:           mat->values->assign(a->a, a->a+a->nz);

1626:           /* assign the pointer */
1627:           matstruct->mat = mat;
1628:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1629:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1630:             stat = cusparseCreateCsr(&matstruct->matDescr,
1631:                                     mat->num_rows, mat->num_cols, mat->num_entries,
1632:                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1633:                                     mat->values->data().get(),
1634:                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1635:                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1636:           }
1637:          #endif
1638:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1639:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1640:           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1641:          #else
1642:           CsrMatrix *mat= new CsrMatrix;
1643:           mat->num_rows = m;
1644:           mat->num_cols = A->cmap->n;
1645:           mat->num_entries = a->nz;
1646:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1647:           mat->row_offsets->assign(ii, ii + m+1);

1649:           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1650:           mat->column_indices->assign(a->j, a->j+a->nz);

1652:           mat->values = new THRUSTARRAY(a->nz);
1653:           mat->values->assign(a->a, a->a+a->nz);

1655:           cusparseHybMat_t hybMat;
1656:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1657:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1658:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1659:           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1660:               matstruct->descr, mat->values->data().get(),
1661:               mat->row_offsets->data().get(),
1662:               mat->column_indices->data().get(),
1663:               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1664:           /* assign the pointer */
1665:           matstruct->mat = hybMat;

1667:           if (mat) {
1668:             if (mat->values) delete (THRUSTARRAY*)mat->values;
1669:             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1670:             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1671:             delete (CsrMatrix*)mat;
1672:           }
1673:          #endif
1674:         }

1676:         /* assign the compressed row indices */
1677:         if (a->compressedrow.use) {
1678:           cusparsestruct->workVector = new THRUSTARRAY(m);
1679:           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1680:           matstruct->cprowIndices->assign(ridx,ridx+m);
1681:           tmp = m;
1682:         } else {
1683:           cusparsestruct->workVector = NULL;
1684:           matstruct->cprowIndices    = NULL;
1685:           tmp = 0;
1686:         }
1687:         PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));

1689:         /* assign the pointer */
1690:         cusparsestruct->mat = matstruct;
1691:       } catch(char *ex) {
1692:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1693:       }
1694:       err  = WaitForCUDA();CHKERRCUDA(err);
1695:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1696:       cusparsestruct->nonzerostate = A->nonzerostate;
1697:     }
1698:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1699:   }
1700:   return(0);
1701: }
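/* How the compressed-row branch above maps onto arrays, for a hypothetical
   5-row matrix in which only rows 1 and 4 contain nonzeros:
     full CSR:        a->i = {0,0,2,2,2,3}                  (6 entries)
     compressed rows: nrows = 2, ii = {0,2,3}, ridx = {1,4}
   The SpMV kernel then produces a result of length nrows in the work vector
   and scatters it to the full-length output through ridx; see
   MatMultAddKernel_SeqAIJCUSPARSE below. */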

1703: struct VecCUDAPlusEquals
1704: {
1705:   template <typename Tuple>
1706:   __host__ __device__
1707:   void operator()(Tuple t)
1708:   {
1709:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1710:   }
1711: };

1713: struct VecCUDAEqualsReverse
1714: {
1715:   template <typename Tuple>
1716:   __host__ __device__
1717:   void operator()(Tuple t)
1718:   {
1719:     thrust::get<0>(t) = thrust::get<1>(t);
1720:   }
1721: };
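/* A minimal sketch (illustrative names, not PETSc source) of how these
   functors are driven: zipping the work vector with a permutation view of z
   and applying VecCUDAPlusEquals performs the scatter-add z[idx[k]] += w[k],
   the compressed-row update used in MatMultAddKernel_SeqAIJCUSPARSE below. */
#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
static void ScatterAddSketch(thrust::device_vector<PetscScalar> &w,
                             thrust::device_vector<PetscScalar> &z,
                             thrust::device_vector<PetscInt>    &idx)
{
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(w.begin(), thrust::make_permutation_iterator(z.begin(), idx.begin()))),
                   thrust::make_zip_iterator(thrust::make_tuple(w.end(),   thrust::make_permutation_iterator(z.begin(), idx.end()))),
                   VecCUDAPlusEquals());
}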

1723: struct MatMatCusparse {
1724:   PetscBool            cisdense;
1725:   PetscScalar          *Bt;
1726:   Mat                  X;
1727: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1728:   PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
1729:   cusparseDnMatDescr_t matBDescr;
1730:   cusparseDnMatDescr_t matCDescr;
1731:   size_t               spmmBufferSize;
1732:   void                 *spmmBuffer;
1733:   PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
1734: #endif
1735: };

1737: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1738: {
1740:   MatMatCusparse *mmdata = (MatMatCusparse *)data;
1741:   cudaError_t    cerr;

1744:   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1745:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1746:   cusparseStatus_t stat;
1747:   if (mmdata->matBDescr)  {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
1748:   if (mmdata->matCDescr)  {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
1749:   if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1750:  #endif
1751:   MatDestroy(&mmdata->X);
1752:   PetscFree(data);
1753:   return(0);
1754: }

1756: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

1758: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1759: {
1760:   Mat_Product                  *product = C->product;
1761:   Mat                          A,B;
1762:   PetscInt                     m,n,blda,clda;
1763:   PetscBool                    flg,biscuda;
1764:   Mat_SeqAIJCUSPARSE           *cusp;
1765:   cusparseStatus_t             stat;
1766:   cusparseOperation_t          opA;
1767:   const PetscScalar            *barray;
1768:   PetscScalar                  *carray;
1769:   PetscErrorCode               ierr;
1770:   MatMatCusparse               *mmdata;
1771:   Mat_SeqAIJCUSPARSEMultStruct *mat;
1772:   CsrMatrix                    *csrmat;
1773:   cudaError_t                  cerr;

1776:   MatCheckProduct(C,1);
1777:   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1778:   mmdata = (MatMatCusparse*)product->data;
1779:   A    = product->A;
1780:   B    = product->B;
1781:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1782:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1783:   /* currently CopyToGpu does not copy if the matrix is bound to the CPU.
1784:      Instead of silently accepting the wrong answer, we prefer to raise an error */
1785:   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1786:   MatSeqAIJCUSPARSECopyToGPU(A);
1787:   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1788:   switch (product->type) {
1789:   case MATPRODUCT_AB:
1790:   case MATPRODUCT_PtAP:
1791:     mat = cusp->mat;
1792:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1793:     m   = A->rmap->n;
1794:     n   = B->cmap->n;
1795:     break;
1796:   case MATPRODUCT_AtB:
1797:     if (!cusp->transgen) {
1798:       mat = cusp->mat;
1799:       opA = CUSPARSE_OPERATION_TRANSPOSE;
1800:     } else {
1801:       MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1802:       mat  = cusp->matTranspose;
1803:       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1804:     }
1805:     m = A->cmap->n;
1806:     n = B->cmap->n;
1807:     break;
1808:   case MATPRODUCT_ABt:
1809:   case MATPRODUCT_RARt:
1810:     mat = cusp->mat;
1811:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1812:     m   = A->rmap->n;
1813:     n   = B->rmap->n;
1814:     break;
1815:   default:
1816:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1817:   }
1818:   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1819:   csrmat = (CsrMatrix*)mat->mat;
1820:   /* if the user passed a CPU matrix, copy the data to the GPU */
1821:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
1822:   if (!biscuda) {MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);}
1823:   MatDenseCUDAGetArrayRead(B,&barray);

1825:   MatDenseGetLDA(B,&blda);
1826:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1827:     MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
1828:     MatDenseGetLDA(mmdata->X,&clda);
1829:   } else {
1830:     MatDenseCUDAGetArrayWrite(C,&carray);
1831:     MatDenseGetLDA(C,&clda);
1832:   }

1834:   PetscLogGpuTimeBegin();
1835:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1836:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
1837:   /* (re)allocate spmmBuffer if not initialized or LDAs are different */
1838:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1839:     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
1840:     if (!mmdata->matBDescr) {
1841:       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1842:       mmdata->Blda = blda;
1843:     }

1845:     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
1846:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1847:       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1848:       mmdata->Clda = clda;
1849:     }

1851:     if (!mat->matDescr) {
1852:       stat = cusparseCreateCsr(&mat->matDescr,
1853:                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
1854:                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
1855:                               csrmat->values->data().get(),
1856:                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1857:                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1858:     }
1859:     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
1860:                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1861:                                    mmdata->matCDescr,cusparse_scalartype,
1862:                                    cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
1863:     if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1864:     cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
1865:     mmdata->initialized = PETSC_TRUE;
1866:   } else {
1867:     /* to be safe, always update pointers of the mats */
1868:     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
1869:     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
1870:     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
1871:   }

1873:   /* do cusparseSpMM, which supports transpose on B */
1874:   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
1875:                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1876:                       mmdata->matCDescr,cusparse_scalartype,
1877:                       cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
1878:  #else
1879:   PetscInt k;
1880:   /* cusparseXcsrmm does not support transpose on B */
1881:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1882:     cublasHandle_t cublasv2handle;
1883:     cublasStatus_t cerr;

1885:     PetscCUBLASGetHandle(&cublasv2handle);
1886:     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1887:                        B->cmap->n,B->rmap->n,
1888:                        &PETSC_CUSPARSE_ONE ,barray,blda,
1889:                        &PETSC_CUSPARSE_ZERO,barray,blda,
1890:                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1891:     blda = B->cmap->n;
1892:     k    = B->cmap->n;
1893:   } else {
1894:     k    = B->rmap->n;
1895:   }

1897:   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
1898:   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1899:                            csrmat->num_entries,mat->alpha_one,mat->descr,
1900:                            csrmat->values->data().get(),
1901:                            csrmat->row_offsets->data().get(),
1902:                            csrmat->column_indices->data().get(),
1903:                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1904:                            carray,clda);CHKERRCUSPARSE(stat);
1905:  #endif
1906:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1907:   PetscLogGpuTimeEnd();
1908:   PetscLogGpuFlops(n*2.0*csrmat->num_entries);
1909:   MatDenseCUDARestoreArrayRead(B,&barray);
1910:   if (product->type == MATPRODUCT_RARt) {
1911:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
1912:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
1913:   } else if (product->type == MATPRODUCT_PtAP) {
1914:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
1915:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
1916:   } else {
1917:     MatDenseCUDARestoreArrayWrite(C,&carray);
1918:   }
1919:   if (mmdata->cisdense) {
1920:     MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
1921:   }
1922:   if (!biscuda) {
1923:     MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
1924:   }
1925:   return(0);
1926: }
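/* How the composite products above are assembled (B plays P or R, and X is
   the dense intermediate created in the symbolic phase):
     MATPRODUCT_PtAP: X = A*B   via SpMM, then C = B^T * X = P^T A P
     MATPRODUCT_RARt: X = A*B^T via SpMM, then C = B   * X = R A R^T
   i.e. one sparse kernel plus one dense multiply per product. */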

1928: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1929: {
1930:   Mat_Product        *product = C->product;
1931:   Mat                A,B;
1932:   PetscInt           m,n;
1933:   PetscBool          cisdense,flg;
1934:   PetscErrorCode     ierr;
1935:   MatMatCusparse     *mmdata;
1936:   Mat_SeqAIJCUSPARSE *cusp;

1939:   MatCheckProduct(C,1);
1940:   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1941:   A    = product->A;
1942:   B    = product->B;
1943:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1944:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1945:   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1946:   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
1947:   switch (product->type) {
1948:   case MATPRODUCT_AB:
1949:     m = A->rmap->n;
1950:     n = B->cmap->n;
1951:     break;
1952:   case MATPRODUCT_AtB:
1953:     m = A->cmap->n;
1954:     n = B->cmap->n;
1955:     break;
1956:   case MATPRODUCT_ABt:
1957:     m = A->rmap->n;
1958:     n = B->rmap->n;
1959:     break;
1960:   case MATPRODUCT_PtAP:
1961:     m = B->cmap->n;
1962:     n = B->cmap->n;
1963:     break;
1964:   case MATPRODUCT_RARt:
1965:     m = B->rmap->n;
1966:     n = B->rmap->n;
1967:     break;
1968:   default:
1969:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1970:   }
1971:   MatSetSizes(C,m,n,m,n);
1972:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy back to the CPU */
1973:   PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
1974:   MatSetType(C,MATSEQDENSECUDA);

1976:   /* product data */
1977:   PetscNew(&mmdata);
1978:   mmdata->cisdense = cisdense;
1979:  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
1980:   /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
1981:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1982:     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
1983:   }
1984:  #endif
1985:   /* for these products we need intermediate storage */
1986:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1987:     MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
1988:     MatSetType(mmdata->X,MATSEQDENSECUDA);
1989:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1990:       MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
1991:     } else {
1992:       MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
1993:     }
1994:   }
1995:   C->product->data    = mmdata;
1996:   C->product->destroy = MatDestroy_MatMatCusparse;

1998:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1999:   return(0);
2000: }

2002: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

2004: /* handles dense B */
2005: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
2006: {
2007:   Mat_Product    *product = C->product;

2011:   MatCheckProduct(C,1);
2012:   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
2013:   if (product->A->boundtocpu) {
2014:     MatProductSetFromOptions_SeqAIJ_SeqDense(C);
2015:     return(0);
2016:   }
2017:   switch (product->type) {
2018:   case MATPRODUCT_AB:
2019:   case MATPRODUCT_AtB:
2020:   case MATPRODUCT_ABt:
2021:   case MATPRODUCT_PtAP:
2022:   case MATPRODUCT_RARt:
2023:     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2024:   default:
2025:     break;
2026:   }
2027:   return(0);
2028: }

2030: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2031: {

2035:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2036:   return(0);
2037: }

2039: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2040: {

2044:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2045:   return(0);
2046: }

2048: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2049: {

2053:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2054:   return(0);
2055: }

2057: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2058: {

2062:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2063:   return(0);
2064: }

2066: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2067: {

2071:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2072:   return(0);
2073: }

2075: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2076: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2077: {
2078:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2079:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2080:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2081:   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2082:   PetscErrorCode               ierr;
2083:   cudaError_t                  cerr;
2084:   cusparseStatus_t             stat;
2085:   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2086:   PetscBool                    compressed;
2087: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2088:   PetscInt                     nx,ny;
2089: #endif

2092:   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2093:   if (!a->nonzerorowcnt) {
2094:     if (!yy) {VecSet_SeqCUDA(zz,0);}
2095:     return(0);
2096:   }
2097:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc.) */
2098:   MatSeqAIJCUSPARSECopyToGPU(A);
2099:   if (!trans) {
2100:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2101:   } else {
2102:     if (herm || !cusparsestruct->transgen) {
2103:       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2104:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2105:     } else {
2106:       if (!cusparsestruct->matTranspose) {MatSeqAIJCUSPARSEGenerateTransposeForMult(A);}
2107:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2108:     }
2109:   }
2110:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2111:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

2113:   try {
2114:     VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
2115:     if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so need to get uptodate zarray on GPU */
2116:     else {VecCUDAGetArrayWrite(zz,&zarray);} /* write zz, so no need to init zarray on GPU */

2118:     PetscLogGpuTimeBegin();
2119:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2120:       /* z = A x + beta y.
2121:          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2122:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2123:       */
2124:       xptr = xarray;
2125:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2126:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2127:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2128:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2129:           allocated to accommodate different uses. So we get the length info directly from mat.
2130:        */
2131:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2132:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2133:         nx = mat->num_cols;
2134:         ny = mat->num_rows;
2135:       }
2136:      #endif
2137:     } else {
2138:       /* z = A^T x + beta y
2139:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2140:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2141:        */
2142:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2143:       dptr = zarray;
2144:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2145:       if (compressed) { /* Scatter x to work vector */
2146:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2147:         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2148:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2149:                          VecCUDAEqualsReverse());
2150:       }
2151:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2152:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2153:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2154:         nx = mat->num_rows;
2155:         ny = mat->num_cols;
2156:       }
2157:      #endif
2158:     }

2160:     /* csr_spmv does y = alpha op(A) x + beta y */
2161:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2162:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2163:       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2164:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2165:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2166:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2167:         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2168:                                 matstruct->matDescr,
2169:                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2170:                                 matstruct->cuSpMV[opA].vecYDescr,
2171:                                 cusparse_scalartype,
2172:                                 cusparsestruct->spmvAlg,
2173:                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2174:         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

2176:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2177:       } else {
2178:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2179:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2180:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2181:       }

2183:       stat = cusparseSpMV(cusparsestruct->handle, opA,
2184:                                matstruct->alpha_one,
2185:                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2186:                                matstruct->cuSpMV[opA].vecXDescr,
2187:                                beta,
2188:                                matstruct->cuSpMV[opA].vecYDescr,
2189:                                cusparse_scalartype,
2190:                                cusparsestruct->spmvAlg,
2191:                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2192:      #else
2193:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2194:       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2195:                                mat->num_rows, mat->num_cols,
2196:                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2197:                                mat->values->data().get(), mat->row_offsets->data().get(),
2198:                                mat->column_indices->data().get(), xptr, beta,
2199:                                dptr);CHKERRCUSPARSE(stat);
2200:      #endif
2201:     } else {
2202:       if (cusparsestruct->nrows) {
2203:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2204:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2205:        #else
2206:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2207:         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2208:                                  matstruct->alpha_one, matstruct->descr, hybMat,
2209:                                  xptr, beta,
2210:                                  dptr);CHKERRCUSPARSE(stat);
2211:        #endif
2212:       }
2213:     }
2214:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2215:     PetscLogGpuTimeEnd();

2217:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2218:       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2219:         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2220:           VecCopy_SeqCUDA(yy,zz); /* zz = yy */
2221:         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2222:           VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2223:         }
2224:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2225:         VecSet_SeqCUDA(zz,0);
2226:       }

2228:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
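      /* i.e. for each i: zz[cprowIndices[i]] += workVector[i]; VecCUDAPlusEquals adds the
         compressed work vector entries into the corresponding rows of the full result vector */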
2229:       if (compressed) {
2230:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

2232:         PetscLogGpuTimeBegin();
2233:         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2234:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2235:                          VecCUDAPlusEquals());
2236:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2237:         PetscLogGpuTimeEnd();
2238:       }
2239:     } else {
2240:       if (yy && yy != zz) {
2241:         VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2242:       }
2243:     }
2244:     VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
2245:     if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
2246:     else {VecCUDARestoreArrayWrite(zz,&zarray);}
2247:   } catch(char *ex) {
2248:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2249:   }
2250:   if (yy) {
2251:     PetscLogGpuFlops(2.0*a->nz);
2252:   } else {
2253:     PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
2254:   }
2255:   return(0);
2256: }

2258: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2259: {

2263:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
2264:   return(0);
2265: }

2267: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
2268: {
2269:   PetscErrorCode              ierr;
2270:   PetscSplitCSRDataStructure  *d_mat = NULL, h_mat;
2271:   PetscBool                   is_seq = PETSC_TRUE;
2272:   PetscInt                    nnz_state = A->nonzerostate;
2274:   if (A->factortype == MAT_FACTOR_NONE) {
2275:     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2276:   }
2277:   if (d_mat) {
2278:     cudaError_t err;
2279:     PetscInfo(A,"Assemble device matrix\n");
2280:     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2281:     nnz_state = h_mat.nonzerostate;
2282:     is_seq = h_mat.seq;
2283:   }
2284:   MatAssemblyEnd_SeqAIJ(A,mode); // this does very little if assembled on GPU - call it?
2285:   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) return(0);
2286:   if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU even though equipped for GPU
2287:     MatSeqAIJCUSPARSECopyToGPU(A);
2288:   } else if (nnz_state > A->nonzerostate) {
2289:     A->offloadmask = PETSC_OFFLOAD_GPU;
2290:   }

2292:   return(0);
2293: }

2295: /* --------------------------------------------------------------------------------*/
2296: /*@
2297:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2298:    (the default parallel PETSc format). This matrix will ultimately be pushed down
2299:    to NVIDIA GPUs and will use the CUSPARSE library for calculations. For good matrix
2300:    assembly performance the user should preallocate the matrix storage by setting
2301:    the parameter nz (or the array nnz).  By setting these parameters accurately,
2302:    performance during matrix assembly can be increased by more than a factor of 50.

2304:    Collective

2306:    Input Parameters:
2307: +  comm - MPI communicator, set to PETSC_COMM_SELF
2308: .  m - number of rows
2309: .  n - number of columns
2310: .  nz - number of nonzeros per row (same for all rows)
2311: -  nnz - array containing the number of nonzeros in the various rows
2312:          (possibly different for each row) or NULL

2314:    Output Parameter:
2315: .  A - the matrix

2317:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2318:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
2319:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

2321:    Notes:
2322:    If nnz is given then nz is ignored

2324:    The AIJ format (also called the Yale sparse matrix format or
2325:    compressed row storage) is fully compatible with standard Fortran 77
2326:    storage.  That is, the stored row and column indices can begin at
2327:    either one (as in Fortran) or zero.  See the users' manual for details.

2329:    Specify the preallocated storage with either nz or nnz (not both).
2330:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2331:    allocation.  For large problems you MUST preallocate memory or you
2332:    will get TERRIBLE performance, see the users' manual chapter on matrices.

2334:    By default, this format uses inodes (identical nodes) when possible, to
2335:    improve numerical efficiency of matrix-vector products and solves. We
2336:    search for consecutive rows with the same nonzero structure, thereby
2337:    reusing matrix information to achieve increased efficiency.

2339:    Level: intermediate

2341: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2342: @*/
2343: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
2344: {

2348:   MatCreate(comm,A);
2349:   MatSetSizes(*A,m,n,m,n);
2350:   MatSetType(*A,MATSEQAIJCUSPARSE);
2351:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
2352:   return(0);
2353: }
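
/* Example (a minimal sketch of the MatCreate()/MatSetType()/MatXXXXSetPreallocation() paradigm
   recommended above; assumes PetscInt m, n, nz are already set and ierr is declared):

     Mat A;
     ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
     ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSeqAIJSetPreallocation(A,nz,NULL);CHKERRQ(ierr);
     ... fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ...
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/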

2355: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
2356: {
2357:   PetscErrorCode              ierr;
2358:   PetscSplitCSRDataStructure  *d_mat = NULL;

2361:   if (A->factortype == MAT_FACTOR_NONE) {
2362:     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2363:     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
2364:     MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
2365:   } else {
2366:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
2367:   }
2368:   if (d_mat) {
2369:     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
2370:     cudaError_t                err;
2371:     PetscSplitCSRDataStructure h_mat;
2372:     PetscInfo(A,"Have device matrix\n");
2373:     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2374:     if (h_mat.seq) {
2375:       if (a->compressedrow.use) {
2376:         err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
2377:       }
2378:       err = cudaFree(d_mat);CHKERRCUDA(err);
2379:     }
2380:   }
2381:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
2382:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
2383:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
2384:   PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
2385:   MatDestroy_SeqAIJ(A);
2386:   return(0);
2387: }

2389: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
2390: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
2391: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
2392: {

2396:   MatDuplicate_SeqAIJ(A,cpvalues,B);
2397:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
2398:   return(0);
2399: }

2401: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
2402: {
2403:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

2407:   if (A->factortype != MAT_FACTOR_NONE) return(0);
2408:   /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
2409:      If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
2410:      Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
2411:      TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
2412:            can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
2413:   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
2414:   /* TODO: add support for this? */
2415:   if (flg) {
2416:     A->ops->mult                      = MatMult_SeqAIJ;
2417:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
2418:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
2419:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
2420:     A->ops->multhermitiantranspose    = NULL;
2421:     A->ops->multhermitiantransposeadd = NULL;
2422:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
2423:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
2424:   } else {
2425:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
2426:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
2427:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
2428:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
2429:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
2430:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2431:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
2432:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
2433:   }
2434:   A->boundtocpu = flg;
2435:   a->inode.use = flg;
2436:   return(0);
2437: }
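
/* Example (a sketch): MatBindToCPU(A,PETSC_TRUE) installs the host kernels above so subsequent
   MatMult() etc. run on the CPU; MatBindToCPU(A,PETSC_FALSE) restores the CUSPARSE versions. */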

2439: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
2440: {
2441:   PetscSplitCSRDataStructure  *d_mat = NULL;
2442:   PetscErrorCode               ierr;
2444:   if (A->factortype == MAT_FACTOR_NONE) {
2445:     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
2446:     d_mat = spptr->deviceMat;
2447:   }
2448:   if (d_mat) {
2449:     Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
2450:     PetscInt     n = A->rmap->n, nnz = a->i[n];
2451:     cudaError_t  err;
2452:     PetscScalar  *vals;
2453:     PetscInfo(A,"Zero device matrix\n");
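    /* d_mat lives in device memory: first copy the device-side values pointer (d_mat->diag.a)
       back to the host, then zero the nnz scalars it points to directly on the device */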
2454:     err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2455:     err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
2456:   }
2457:   MatZeroEntries_SeqAIJ(A);

2459:   return(0);
2460: }

2462: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
2463: {
2464:   PetscErrorCode   ierr;
2465:   cusparseStatus_t stat;
2466:   Mat              B;

2469:   PetscCUDAInitializeCheck();
2470:   if (reuse == MAT_INITIAL_MATRIX) {
2471:     MatDuplicate(A,MAT_COPY_VALUES,newmat);
2472:   } else if (reuse == MAT_REUSE_MATRIX) {
2473:     MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
2474:   }
2475:   B = *newmat;

2477:   PetscFree(B->defaultvectype);
2478:   PetscStrallocpy(VECCUDA,&B->defaultvectype);

2480:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
2481:     if (B->factortype == MAT_FACTOR_NONE) {
2482:       Mat_SeqAIJCUSPARSE *spptr;

2484:       PetscNew(&spptr);
2485:       spptr->format = MAT_CUSPARSE_CSR;
2486:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2487:       B->spptr = spptr;
2488:       spptr->deviceMat = NULL;
2489:     } else {
2490:       Mat_SeqAIJCUSPARSETriFactors *spptr;

2492:       PetscNew(&spptr);
2493:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2494:       B->spptr = spptr;
2495:     }
2496:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2497:   }
2498:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
2499:   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
2500:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
2501:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
2502:   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
2503:   B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

2505:   MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
2506:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
2507:   PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
2508:   return(0);
2509: }
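
/* Example (a sketch): an assembled MATSEQAIJ matrix A can be converted in place with

     ierr = MatConvert(A,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
*/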

2511: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
2512: {

2516:   MatCreate_SeqAIJ(B);
2517:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
2518:   PetscObjectOptionsBegin((PetscObject)B);
2519:   MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);
2520:   PetscOptionsEnd();
2521:   return(0);
2522: }

2524: /*MC
2525:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

2527:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
2528:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later and are no longer supported as of CUDA 11.0.
2529:    All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

2531:    Options Database Keys:
2532: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2533: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2534: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

2536:   Level: beginner

2538: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2539: M*/
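
/* Example (a sketch; "./app" stands for any PETSc application): select the type and storage
   format from the command line with

     ./app -mat_type aijcusparse -mat_cusparse_storage_format csr

   or programmatically with

     ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);CHKERRQ(ierr);
*/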

2541: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


2544: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
2545: {

2549:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
2550:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
2551:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
2552:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
2553:   return(0);
2554: }
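
/* Example (a sketch): with the registrations above, a CUSPARSE factorization can be requested
   via -pc_factor_mat_solver_type cusparse, or obtained directly:

     Mat F;
     ierr = MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_LU,&F);CHKERRQ(ierr);
*/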

2556: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
2557: {
2558:   PetscErrorCode   ierr;
2559:   cusparseStatus_t stat;
2560:   cusparseHandle_t handle;

2563:   if (*cusparsestruct) {
2564:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
2565:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
2566:     delete (*cusparsestruct)->workVector;
2567:     delete (*cusparsestruct)->rowoffsets_gpu;
2568:     if ((handle = (*cusparsestruct)->handle)) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);}
2569:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2570:     cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
2571:    #endif
2572:     PetscFree(*cusparsestruct);
2573:   }
2574:   return(0);
2575: }

2577: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
2578: {
2580:   if (*mat) {
2581:     delete (*mat)->values;
2582:     delete (*mat)->column_indices;
2583:     delete (*mat)->row_offsets;
2584:     delete *mat;
2585:     *mat = 0;
2586:   }
2587:   return(0);
2588: }

2590: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
2591: {
2592:   cusparseStatus_t stat;
2593:   PetscErrorCode   ierr;

2596:   if (*trifactor) {
2597:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
2598:     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
2599:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
2600:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2601:     cudaError_t cerr;
2602:     if ((*trifactor)->solveBuffer)   {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
2603:     if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
2604:    #endif
2605:     delete *trifactor;
2606:     *trifactor = 0;
2607:   }
2608:   return(0);
2609: }

2611: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
2612: {
2613:   CsrMatrix        *mat;
2614:   cusparseStatus_t stat;
2615:   cudaError_t      err;

2618:   if (*matstruct) {
2619:     if ((*matstruct)->mat) {
2620:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
2621:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2622:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2623:        #else
2624:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
2625:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
2626:        #endif
2627:       } else {
2628:         mat = (CsrMatrix*)(*matstruct)->mat;
2629:         CsrMatrix_Destroy(&mat);
2630:       }
2631:     }
2632:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
2633:     delete (*matstruct)->cprowIndices;
2634:     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
2635:     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
2636:     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

2638:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2639:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
2640:     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
2641:     for (int i=0; i<3; i++) {
2642:       if (mdata->cuSpMV[i].initialized) {
2643:         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
2644:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
2645:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
2646:       }
2647:     }
2648:    #endif
2649:     delete *matstruct;
2650:     *matstruct = 0;
2651:   }
2652:   return(0);
2653: }

2655: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2656: {

2660:   if (*trifactors) {
2661:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
2662:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
2663:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
2664:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
2665:     delete (*trifactors)->rpermIndices;
2666:     delete (*trifactors)->cpermIndices;
2667:     delete (*trifactors)->workVector;
2668:     (*trifactors)->rpermIndices = 0;
2669:     (*trifactors)->cpermIndices = 0;
2670:     (*trifactors)->workVector = 0;
2671:   }
2672:   return(0);
2673: }

2675: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2676: {
2677:   PetscErrorCode   ierr;
2678:   cusparseHandle_t handle;
2679:   cusparseStatus_t stat;

2682:   if (*trifactors) {
2683:     MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
2684:     if ((handle = (*trifactors)->handle)) {
2685:       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
2686:     }
2687:     PetscFree(*trifactors);
2688:   }
2689:   return(0);
2690: }