Actual source code: aijcusparse.cu

petsc-3.12.5 2020-03-29
Report Typos and Errors
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX
  7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  9: #include <petscconf.h>
 10:  #include <../src/mat/impls/aij/seq/aij.h>
 11:  #include <../src/mat/impls/sbaij/seq/sbaij.h>
 12:  #include <../src/vec/vec/impls/dvecimpl.h>
 13:  #include <petsc/private/vecimpl.h>
 14: #undef VecType
 15:  #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

 17: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

 19: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 20: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 21: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 23: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 24: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 25: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 27: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 28: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 29: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 30: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 31: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 32: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 33: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 34: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 35: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

 37: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 38: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 39: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 40: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 41: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 43: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 44: {
 45:   cusparseStatus_t   stat;
 46:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 49:   cusparsestruct->stream = stream;
 50:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
 51:   return(0);
 52: }

 54: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
 55: {
 56:   cusparseStatus_t   stat;
 57:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 60:   if (cusparsestruct->handle != handle) {
 61:     if (cusparsestruct->handle) {
 62:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
 63:     }
 64:     cusparsestruct->handle = handle;
 65:   }
 66:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
 67:   return(0);
 68: }

 70: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
 71: {
 72:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
 74:   if (cusparsestruct->handle)
 75:     cusparsestruct->handle = 0;
 76:   return(0);
 77: }

 79: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
 80: {
 82:   *type = MATSOLVERCUSPARSE;
 83:   return(0);
 84: }

 86: /*MC
 87:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
 88:   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
 89:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
 90:   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
 91:   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
 92:   algorithms are not recommended. This class does NOT support direct solver operations.

 94:   Level: beginner

 96: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
 97: M*/

 99: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
100: {
102:   PetscInt       n = A->rmap->n;

105:   MatCreate(PetscObjectComm((PetscObject)A),B);
106:   (*B)->factortype = ftype;
107:   MatSetSizes(*B,n,n,n,n);
108:   MatSetType(*B,MATSEQAIJCUSPARSE);

110:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
111:     MatSetBlockSizesFromMats(*B,A,A);
112:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
113:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
114:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
115:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
116:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
117:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

119:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
120:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
121:   return(0);
122: }

124: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
125: {
126:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

129:   switch (op) {
130:   case MAT_CUSPARSE_MULT:
131:     cusparsestruct->format = format;
132:     break;
133:   case MAT_CUSPARSE_ALL:
134:     cusparsestruct->format = format;
135:     break;
136:   default:
137:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
138:   }
139:   return(0);
140: }

142: /*@
143:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
144:    operation. Only the MatMult operation can use different GPU storage formats
145:    for MPIAIJCUSPARSE matrices.
146:    Not Collective

148:    Input Parameters:
149: +  A - Matrix of type SEQAIJCUSPARSE
150: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
151: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

153:    Output Parameter:

155:    Level: intermediate

157: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158: @*/
159: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
160: {

165:   PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
166:   return(0);
167: }

169: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
170: {
171:   PetscErrorCode           ierr;
172:   MatCUSPARSEStorageFormat format;
173:   PetscBool                flg;
174:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

177:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
178:   if (A->factortype==MAT_FACTOR_NONE) {
179:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
180:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
181:     if (flg) {
182:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
183:     }
184:   }
185:   PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
186:                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
187:   if (flg) {
188:     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
189:   }
190:   PetscOptionsTail();
191:   return(0);

193: }

195: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
196: {

200:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
201:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
202:   return(0);
203: }

205: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
206: {

210:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
211:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
212:   return(0);
213: }

215: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
216: {

220:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
221:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
222:   return(0);
223: }

225: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
226: {

230:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
231:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
232:   return(0);
233: }

235: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
236: {
237:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
238:   PetscInt                          n = A->rmap->n;
239:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
240:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
241:   cusparseStatus_t                  stat;
242:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
243:   const MatScalar                   *aa = a->a,*v;
244:   PetscInt                          *AiLo, *AjLo;
245:   PetscScalar                       *AALo;
246:   PetscInt                          i,nz, nzLower, offset, rowOffset;
247:   PetscErrorCode                    ierr;

250:   if (!n) return(0);
251:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
252:     try {
253:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
254:       nzLower=n+ai[n]-ai[1];

256:       /* Allocate Space for the lower triangular matrix */
257:       cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
258:       cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
259:       cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

261:       /* Fill the lower triangular matrix */
262:       AiLo[0]  = (PetscInt) 0;
263:       AiLo[n]  = nzLower;
264:       AjLo[0]  = (PetscInt) 0;
265:       AALo[0]  = (MatScalar) 1.0;
266:       v        = aa;
267:       vi       = aj;
268:       offset   = 1;
269:       rowOffset= 1;
270:       for (i=1; i<n; i++) {
271:         nz = ai[i+1] - ai[i];
272:         /* additional 1 for the term on the diagonal */
273:         AiLo[i]    = rowOffset;
274:         rowOffset += nz+1;

276:         PetscArraycpy(&(AjLo[offset]), vi, nz);
277:         PetscArraycpy(&(AALo[offset]), v, nz);

279:         offset      += nz;
280:         AjLo[offset] = (PetscInt) i;
281:         AALo[offset] = (MatScalar) 1.0;
282:         offset      += 1;

284:         v  += nz;
285:         vi += nz;
286:       }

288:       /* allocate space for the triangular factor information */
289:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

291:       /* Create the matrix description */
292:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
293:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
294:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
295:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
296:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

298:       /* Create the solve analysis information */
299:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

301:       /* set the operation */
302:       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

304:       /* set the matrix */
305:       loTriFactor->csrMat = new CsrMatrix;
306:       loTriFactor->csrMat->num_rows = n;
307:       loTriFactor->csrMat->num_cols = n;
308:       loTriFactor->csrMat->num_entries = nzLower;

310:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
311:       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

313:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
314:       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

316:       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
317:       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

319:       /* perform the solve analysis */
320:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
321:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
322:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
323:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

325:       /* assign the pointer. Is this really necessary? */
326:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

328:       cudaFreeHost(AiLo);CHKERRCUDA(ierr);
329:       cudaFreeHost(AjLo);CHKERRCUDA(ierr);
330:       cudaFreeHost(AALo);CHKERRCUDA(ierr);
331:       PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
332:     } catch(char *ex) {
333:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
334:     }
335:   }
336:   return(0);
337: }

339: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
340: {
341:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
342:   PetscInt                          n = A->rmap->n;
343:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
344:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
345:   cusparseStatus_t                  stat;
346:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
347:   const MatScalar                   *aa = a->a,*v;
348:   PetscInt                          *AiUp, *AjUp;
349:   PetscScalar                       *AAUp;
350:   PetscInt                          i,nz, nzUpper, offset;
351:   PetscErrorCode                    ierr;

354:   if (!n) return(0);
355:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
356:     try {
357:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
358:       nzUpper = adiag[0]-adiag[n];

360:       /* Allocate Space for the upper triangular matrix */
361:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
362:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
363:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

365:       /* Fill the upper triangular matrix */
366:       AiUp[0]=(PetscInt) 0;
367:       AiUp[n]=nzUpper;
368:       offset = nzUpper;
369:       for (i=n-1; i>=0; i--) {
370:         v  = aa + adiag[i+1] + 1;
371:         vi = aj + adiag[i+1] + 1;

373:         /* number of elements NOT on the diagonal */
374:         nz = adiag[i] - adiag[i+1]-1;

376:         /* decrement the offset */
377:         offset -= (nz+1);

379:         /* first, set the diagonal elements */
380:         AjUp[offset] = (PetscInt) i;
381:         AAUp[offset] = (MatScalar)1./v[nz];
382:         AiUp[i]      = AiUp[i+1] - (nz+1);

384:         PetscArraycpy(&(AjUp[offset+1]), vi, nz);
385:         PetscArraycpy(&(AAUp[offset+1]), v, nz);
386:       }

388:       /* allocate space for the triangular factor information */
389:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

391:       /* Create the matrix description */
392:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
393:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
394:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
395:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
396:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

398:       /* Create the solve analysis information */
399:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

401:       /* set the operation */
402:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

404:       /* set the matrix */
405:       upTriFactor->csrMat = new CsrMatrix;
406:       upTriFactor->csrMat->num_rows = n;
407:       upTriFactor->csrMat->num_cols = n;
408:       upTriFactor->csrMat->num_entries = nzUpper;

410:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
411:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

413:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
414:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

416:       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
417:       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

419:       /* perform the solve analysis */
420:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
421:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
422:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
423:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

425:       /* assign the pointer. Is this really necessary? */
426:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

428:       cudaFreeHost(AiUp);CHKERRCUDA(ierr);
429:       cudaFreeHost(AjUp);CHKERRCUDA(ierr);
430:       cudaFreeHost(AAUp);CHKERRCUDA(ierr);
431:       PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
432:     } catch(char *ex) {
433:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
434:     }
435:   }
436:   return(0);
437: }

439: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
440: {
441:   PetscErrorCode               ierr;
442:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
443:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
444:   IS                           isrow = a->row,iscol = a->icol;
445:   PetscBool                    row_identity,col_identity;
446:   const PetscInt               *r,*c;
447:   PetscInt                     n = A->rmap->n;

450:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
451:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

453:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
454:   cusparseTriFactors->nnz=a->nz;

456:   A->offloadmask = PETSC_OFFLOAD_BOTH;
457:   /* lower triangular indices */
458:   ISGetIndices(isrow,&r);
459:   ISIdentity(isrow,&row_identity);
460:   if (!row_identity) {
461:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
462:     cusparseTriFactors->rpermIndices->assign(r, r+n);
463:   }
464:   ISRestoreIndices(isrow,&r);

466:   /* upper triangular indices */
467:   ISGetIndices(iscol,&c);
468:   ISIdentity(iscol,&col_identity);
469:   if (!col_identity) {
470:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
471:     cusparseTriFactors->cpermIndices->assign(c, c+n);
472:   }

474:   if (!row_identity && !col_identity) {
475:     PetscLogCpuToGpu(2*n*sizeof(PetscInt));
476:   } else if(!row_identity) {
477:     PetscLogCpuToGpu(n*sizeof(PetscInt));
478:   } else if(!col_identity) {
479:     PetscLogCpuToGpu(n*sizeof(PetscInt));
480:   }

482:   ISRestoreIndices(iscol,&c);
483:   return(0);
484: }

486: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
487: {
488:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
489:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
490:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
491:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
492:   cusparseStatus_t                  stat;
493:   PetscErrorCode                    ierr;
494:   PetscInt                          *AiUp, *AjUp;
495:   PetscScalar                       *AAUp;
496:   PetscScalar                       *AALo;
497:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
498:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
499:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
500:   const MatScalar                   *aa = b->a,*v;

503:   if (!n) return(0);
504:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
505:     try {
506:       /* Allocate Space for the upper triangular matrix */
507:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
508:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
509:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
510:       cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

512:       /* Fill the upper triangular matrix */
513:       AiUp[0]=(PetscInt) 0;
514:       AiUp[n]=nzUpper;
515:       offset = 0;
516:       for (i=0; i<n; i++) {
517:         /* set the pointers */
518:         v  = aa + ai[i];
519:         vj = aj + ai[i];
520:         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

522:         /* first, set the diagonal elements */
523:         AjUp[offset] = (PetscInt) i;
524:         AAUp[offset] = (MatScalar)1.0/v[nz];
525:         AiUp[i]      = offset;
526:         AALo[offset] = (MatScalar)1.0/v[nz];

528:         offset+=1;
529:         if (nz>0) {
530:           PetscArraycpy(&(AjUp[offset]), vj, nz);
531:           PetscArraycpy(&(AAUp[offset]), v, nz);
532:           for (j=offset; j<offset+nz; j++) {
533:             AAUp[j] = -AAUp[j];
534:             AALo[j] = AAUp[j]/v[nz];
535:           }
536:           offset+=nz;
537:         }
538:       }

540:       /* allocate space for the triangular factor information */
541:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

543:       /* Create the matrix description */
544:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
545:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
546:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
547:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
548:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

550:       /* Create the solve analysis information */
551:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

553:       /* set the operation */
554:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

556:       /* set the matrix */
557:       upTriFactor->csrMat = new CsrMatrix;
558:       upTriFactor->csrMat->num_rows = A->rmap->n;
559:       upTriFactor->csrMat->num_cols = A->cmap->n;
560:       upTriFactor->csrMat->num_entries = a->nz;

562:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
563:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

565:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
566:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

568:       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
569:       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

571:       /* perform the solve analysis */
572:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
573:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
574:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
575:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

577:       /* assign the pointer. Is this really necessary? */
578:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

580:       /* allocate space for the triangular factor information */
581:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

583:       /* Create the matrix description */
584:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
585:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
586:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
587:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
588:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

590:       /* Create the solve analysis information */
591:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

593:       /* set the operation */
594:       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

596:       /* set the matrix */
597:       loTriFactor->csrMat = new CsrMatrix;
598:       loTriFactor->csrMat->num_rows = A->rmap->n;
599:       loTriFactor->csrMat->num_cols = A->cmap->n;
600:       loTriFactor->csrMat->num_entries = a->nz;

602:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
603:       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

605:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
606:       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

608:       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
609:       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
610:       PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));

612:       /* perform the solve analysis */
613:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
614:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
615:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
616:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

618:       /* assign the pointer. Is this really necessary? */
619:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

621:       A->offloadmask = PETSC_OFFLOAD_BOTH;
622:       cudaFreeHost(AiUp);CHKERRCUDA(ierr);
623:       cudaFreeHost(AjUp);CHKERRCUDA(ierr);
624:       cudaFreeHost(AAUp);CHKERRCUDA(ierr);
625:       cudaFreeHost(AALo);CHKERRCUDA(ierr);
626:     } catch(char *ex) {
627:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
628:     }
629:   }
630:   return(0);
631: }

633: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
634: {
635:   PetscErrorCode               ierr;
636:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
637:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
638:   IS                           ip = a->row;
639:   const PetscInt               *rip;
640:   PetscBool                    perm_identity;
641:   PetscInt                     n = A->rmap->n;

644:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
645:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
646:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;

648:   /* lower triangular indices */
649:   ISGetIndices(ip,&rip);
650:   ISIdentity(ip,&perm_identity);
651:   if (!perm_identity) {
652:     IS             iip;
653:     const PetscInt *irip;

655:     ISInvertPermutation(ip,PETSC_DECIDE,&iip);
656:     ISGetIndices(iip,&irip);
657:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
658:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
659:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
660:     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
661:     ISRestoreIndices(iip,&irip);
662:     ISDestroy(&iip);
663:     PetscLogCpuToGpu(2*n*sizeof(PetscInt));
664:   }
665:   ISRestoreIndices(ip,&rip);
666:   return(0);
667: }

669: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
670: {
671:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
672:   IS             isrow = b->row,iscol = b->col;
673:   PetscBool      row_identity,col_identity;

677:   MatLUFactorNumeric_SeqAIJ(B,A,info);
678:   /* determine which version of MatSolve needs to be used. */
679:   ISIdentity(isrow,&row_identity);
680:   ISIdentity(iscol,&col_identity);
681:   if (row_identity && col_identity) {
682:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
683:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
684:     B->ops->matsolve = NULL;
685:     B->ops->matsolvetranspose = NULL;
686:   } else {
687:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
688:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
689:     B->ops->matsolve = NULL;
690:     B->ops->matsolvetranspose = NULL;
691:   }

693:   /* get the triangular factors */
694:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
695:   return(0);
696: }

698: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
699: {
700:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
701:   IS             ip = b->row;
702:   PetscBool      perm_identity;

706:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);

708:   /* determine which version of MatSolve needs to be used. */
709:   ISIdentity(ip,&perm_identity);
710:   if (perm_identity) {
711:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
712:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
713:     B->ops->matsolve = NULL;
714:     B->ops->matsolvetranspose = NULL;
715:   } else {
716:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
717:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
718:     B->ops->matsolve = NULL;
719:     B->ops->matsolvetranspose = NULL;
720:   }

722:   /* get the triangular factors */
723:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
724:   return(0);
725: }

727: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
728: {
729:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
730:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
731:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
732:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
733:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
734:   cusparseStatus_t                  stat;
735:   cusparseIndexBase_t               indexBase;
736:   cusparseMatrixType_t              matrixType;
737:   cusparseFillMode_t                fillMode;
738:   cusparseDiagType_t                diagType;


742:   /*********************************************/
743:   /* Now the Transpose of the Lower Tri Factor */
744:   /*********************************************/

746:   /* allocate space for the transpose of the lower triangular factor */
747:   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

749:   /* set the matrix descriptors of the lower triangular factor */
750:   matrixType = cusparseGetMatType(loTriFactor->descr);
751:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
752:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
753:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
754:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

756:   /* Create the matrix description */
757:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
758:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
759:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
760:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
761:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

763:   /* Create the solve analysis information */
764:   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

766:   /* set the operation */
767:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

769:   /* allocate GPU space for the CSC of the lower triangular factor*/
770:   loTriFactorT->csrMat = new CsrMatrix;
771:   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
772:   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
773:   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
774:   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
775:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
776:   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

778:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
779:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
780:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
781:                           loTriFactor->csrMat->values->data().get(),
782:                           loTriFactor->csrMat->row_offsets->data().get(),
783:                           loTriFactor->csrMat->column_indices->data().get(),
784:                           loTriFactorT->csrMat->values->data().get(),
785:                           loTriFactorT->csrMat->column_indices->data().get(),
786:                           loTriFactorT->csrMat->row_offsets->data().get(),
787:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

789:   /* perform the solve analysis on the transposed matrix */
790:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
791:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
792:                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
793:                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
794:                            loTriFactorT->solveInfo);CHKERRCUDA(stat);

796:   /* assign the pointer. Is this really necessary? */
797:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

799:   /*********************************************/
800:   /* Now the Transpose of the Upper Tri Factor */
801:   /*********************************************/

803:   /* allocate space for the transpose of the upper triangular factor */
804:   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

806:   /* set the matrix descriptors of the upper triangular factor */
807:   matrixType = cusparseGetMatType(upTriFactor->descr);
808:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
809:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
810:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
811:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

813:   /* Create the matrix description */
814:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
815:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
816:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
817:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
818:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

820:   /* Create the solve analysis information */
821:   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

823:   /* set the operation */
824:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

826:   /* allocate GPU space for the CSC of the upper triangular factor*/
827:   upTriFactorT->csrMat = new CsrMatrix;
828:   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
829:   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
830:   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
831:   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
832:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
833:   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

835:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
836:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
837:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
838:                           upTriFactor->csrMat->values->data().get(),
839:                           upTriFactor->csrMat->row_offsets->data().get(),
840:                           upTriFactor->csrMat->column_indices->data().get(),
841:                           upTriFactorT->csrMat->values->data().get(),
842:                           upTriFactorT->csrMat->column_indices->data().get(),
843:                           upTriFactorT->csrMat->row_offsets->data().get(),
844:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

846:   /* perform the solve analysis on the transposed matrix */
847:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
848:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
849:                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
850:                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
851:                            upTriFactorT->solveInfo);CHKERRCUDA(stat);

853:   /* assign the pointer. Is this really necessary? */
854:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
855:   return(0);
856: }

858: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
859: {
860:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
861:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
862:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
863:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
864:   cusparseStatus_t             stat;
865:   cusparseIndexBase_t          indexBase;
866:   cudaError_t                  err;
867:   PetscErrorCode               ierr;


871:   /* allocate space for the triangular factor information */
872:   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
873:   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
874:   indexBase = cusparseGetMatIndexBase(matstruct->descr);
875:   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
876:   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

878:   /* set alpha and beta */
879:   err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
880:   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
881:   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
882:   err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
883:   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
884:   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
885:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

887:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
888:     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
889:     CsrMatrix *matrixT= new CsrMatrix;
890:     matrixT->num_rows = A->cmap->n;
891:     matrixT->num_cols = A->rmap->n;
892:     matrixT->num_entries = a->nz;
893:     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
894:     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
895:     matrixT->values = new THRUSTARRAY(a->nz);

897:     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
898:     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
899:     /* compute the transpose, i.e. the CSC */
900:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
901:     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
902:                             A->cmap->n, matrix->num_entries,
903:                             matrix->values->data().get(),
904:                             cusparsestruct->rowoffsets_gpu->data().get(),
905:                             matrix->column_indices->data().get(),
906:                             matrixT->values->data().get(),
907:                             matrixT->column_indices->data().get(),
908:                             matrixT->row_offsets->data().get(),
909:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
910:     /* assign the pointer */
911:     matstructT->mat = matrixT;
912:     PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));
913:   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
914:     /* First convert HYB to CSR */
915:     CsrMatrix *temp= new CsrMatrix;
916:     temp->num_rows = A->rmap->n;
917:     temp->num_cols = A->cmap->n;
918:     temp->num_entries = a->nz;
919:     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
920:     temp->column_indices = new THRUSTINTARRAY32(a->nz);
921:     temp->values = new THRUSTARRAY(a->nz);


924:     stat = cusparse_hyb2csr(cusparsestruct->handle,
925:                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
926:                             temp->values->data().get(),
927:                             temp->row_offsets->data().get(),
928:                             temp->column_indices->data().get());CHKERRCUDA(stat);

930:     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
931:     CsrMatrix *tempT= new CsrMatrix;
932:     tempT->num_rows = A->rmap->n;
933:     tempT->num_cols = A->cmap->n;
934:     tempT->num_entries = a->nz;
935:     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
936:     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
937:     tempT->values = new THRUSTARRAY(a->nz);

939:     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
940:                             temp->num_cols, temp->num_entries,
941:                             temp->values->data().get(),
942:                             temp->row_offsets->data().get(),
943:                             temp->column_indices->data().get(),
944:                             tempT->values->data().get(),
945:                             tempT->column_indices->data().get(),
946:                             tempT->row_offsets->data().get(),
947:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

949:     /* Last, convert CSC to HYB */
950:     cusparseHybMat_t hybMat;
951:     stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
952:     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
953:       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
954:     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
955:                             matstructT->descr, tempT->values->data().get(),
956:                             tempT->row_offsets->data().get(),
957:                             tempT->column_indices->data().get(),
958:                             hybMat, 0, partition);CHKERRCUDA(stat);

960:     /* assign the pointer */
961:     matstructT->mat = hybMat;
962:     PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));

964:     /* delete temporaries */
965:     if (tempT) {
966:       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
967:       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
968:       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
969:       delete (CsrMatrix*) tempT;
970:     }
971:     if (temp) {
972:       if (temp->values) delete (THRUSTARRAY*) temp->values;
973:       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
974:       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
975:       delete (CsrMatrix*) temp;
976:     }
977:   }
978:   /* assign the compressed row indices */
979:   matstructT->cprowIndices = new THRUSTINTARRAY;
980:   matstructT->cprowIndices->resize(A->cmap->n);
981:   thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
982:   /* assign the pointer */
983:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
984:   return(0);
985: }

987: /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
988: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
989: {
990:   PetscInt                              n = xx->map->n;
991:   const PetscScalar                     *barray;
992:   PetscScalar                           *xarray;
993:   thrust::device_ptr<const PetscScalar> bGPU;
994:   thrust::device_ptr<PetscScalar>       xGPU;
995:   cusparseStatus_t                      stat;
996:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
997:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
998:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
999:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1000:   PetscErrorCode                        ierr;

1003:   /* Analyze the matrix and create the transpose ... on the fly */
1004:   if (!loTriFactorT && !upTriFactorT) {
1005:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1006:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1007:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1008:   }

1010:   /* Get the GPU pointers */
1011:   VecCUDAGetArrayWrite(xx,&xarray);
1012:   VecCUDAGetArrayRead(bb,&barray);
1013:   xGPU = thrust::device_pointer_cast(xarray);
1014:   bGPU = thrust::device_pointer_cast(barray);

1016:   PetscLogGpuTimeBegin();
1017:   /* First, reorder with the row permutation */
1018:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1019:                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1020:                xGPU);

1022:   /* First, solve U */
1023:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1024:                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1025:                         upTriFactorT->csrMat->values->data().get(),
1026:                         upTriFactorT->csrMat->row_offsets->data().get(),
1027:                         upTriFactorT->csrMat->column_indices->data().get(),
1028:                         upTriFactorT->solveInfo,
1029:                         xarray, tempGPU->data().get());CHKERRCUDA(stat);

1031:   /* Then, solve L */
1032:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1033:                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1034:                         loTriFactorT->csrMat->values->data().get(),
1035:                         loTriFactorT->csrMat->row_offsets->data().get(),
1036:                         loTriFactorT->csrMat->column_indices->data().get(),
1037:                         loTriFactorT->solveInfo,
1038:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1040:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1041:   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1042:                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1043:                tempGPU->begin());

1045:   /* Copy the temporary to the full solution. */
1046:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

1048:   /* restore */
1049:   VecCUDARestoreArrayRead(bb,&barray);
1050:   VecCUDARestoreArrayWrite(xx,&xarray);
1051:   WaitForGPU();CHKERRCUDA(ierr);
1052:   PetscLogGpuTimeEnd();
1053:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1054:   return(0);
1055: }

1057: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1058: {
1059:   const PetscScalar                 *barray;
1060:   PetscScalar                       *xarray;
1061:   cusparseStatus_t                  stat;
1062:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1063:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1064:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1065:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1066:   PetscErrorCode                    ierr;

1069:   /* Analyze the matrix and create the transpose ... on the fly */
1070:   if (!loTriFactorT && !upTriFactorT) {
1071:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1072:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1073:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1074:   }

1076:   /* Get the GPU pointers */
1077:   VecCUDAGetArrayWrite(xx,&xarray);
1078:   VecCUDAGetArrayRead(bb,&barray);

1080:   PetscLogGpuTimeBegin();
1081:   /* First, solve U */
1082:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1083:                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1084:                         upTriFactorT->csrMat->values->data().get(),
1085:                         upTriFactorT->csrMat->row_offsets->data().get(),
1086:                         upTriFactorT->csrMat->column_indices->data().get(),
1087:                         upTriFactorT->solveInfo,
1088:                         barray, tempGPU->data().get());CHKERRCUDA(stat);

1090:   /* Then, solve L */
1091:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1092:                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1093:                         loTriFactorT->csrMat->values->data().get(),
1094:                         loTriFactorT->csrMat->row_offsets->data().get(),
1095:                         loTriFactorT->csrMat->column_indices->data().get(),
1096:                         loTriFactorT->solveInfo,
1097:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1099:   /* restore */
1100:   VecCUDARestoreArrayRead(bb,&barray);
1101:   VecCUDARestoreArrayWrite(xx,&xarray);
1102:   WaitForGPU();CHKERRCUDA(ierr);
1103:   PetscLogGpuTimeEnd();
1104:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1105:   return(0);
1106: }

1108: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1109: {
1110:   const PetscScalar                     *barray;
1111:   PetscScalar                           *xarray;
1112:   thrust::device_ptr<const PetscScalar> bGPU;
1113:   thrust::device_ptr<PetscScalar>       xGPU;
1114:   cusparseStatus_t                      stat;
1115:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1116:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1117:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1118:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1119:   PetscErrorCode                        ierr;


1123:   /* Get the GPU pointers */
1124:   VecCUDAGetArrayWrite(xx,&xarray);
1125:   VecCUDAGetArrayRead(bb,&barray);
1126:   xGPU = thrust::device_pointer_cast(xarray);
1127:   bGPU = thrust::device_pointer_cast(barray);

1129:   PetscLogGpuTimeBegin();
1130:   /* First, reorder with the row permutation */
1131:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1132:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1133:                tempGPU->begin());

1135:   /* Next, solve L */
1136:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1137:                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1138:                         loTriFactor->csrMat->values->data().get(),
1139:                         loTriFactor->csrMat->row_offsets->data().get(),
1140:                         loTriFactor->csrMat->column_indices->data().get(),
1141:                         loTriFactor->solveInfo,
1142:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1144:   /* Then, solve U */
1145:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1146:                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1147:                         upTriFactor->csrMat->values->data().get(),
1148:                         upTriFactor->csrMat->row_offsets->data().get(),
1149:                         upTriFactor->csrMat->column_indices->data().get(),
1150:                         upTriFactor->solveInfo,
1151:                         xarray, tempGPU->data().get());CHKERRCUDA(stat);

1153:   /* Last, reorder with the column permutation */
1154:   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1155:                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1156:                xGPU);

1158:   VecCUDARestoreArrayRead(bb,&barray);
1159:   VecCUDARestoreArrayWrite(xx,&xarray);
1160:   WaitForGPU();CHKERRCUDA(ierr);
1161:   PetscLogGpuTimeEnd();
1162:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1163:   return(0);
1164: }

1166: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1167: {
1168:   const PetscScalar                 *barray;
1169:   PetscScalar                       *xarray;
1170:   cusparseStatus_t                  stat;
1171:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1172:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1173:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1174:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1175:   PetscErrorCode                    ierr;

1178:   /* Get the GPU pointers */
1179:   VecCUDAGetArrayWrite(xx,&xarray);
1180:   VecCUDAGetArrayRead(bb,&barray);

1182:   PetscLogGpuTimeBegin();
1183:   /* First, solve L */
1184:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1185:                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1186:                         loTriFactor->csrMat->values->data().get(),
1187:                         loTriFactor->csrMat->row_offsets->data().get(),
1188:                         loTriFactor->csrMat->column_indices->data().get(),
1189:                         loTriFactor->solveInfo,
1190:                         barray, tempGPU->data().get());CHKERRCUDA(stat);

1192:   /* Next, solve U */
1193:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1194:                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1195:                         upTriFactor->csrMat->values->data().get(),
1196:                         upTriFactor->csrMat->row_offsets->data().get(),
1197:                         upTriFactor->csrMat->column_indices->data().get(),
1198:                         upTriFactor->solveInfo,
1199:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1201:   VecCUDARestoreArrayRead(bb,&barray);
1202:   VecCUDARestoreArrayWrite(xx,&xarray);
1203:   WaitForGPU();CHKERRCUDA(ierr);
1204:   PetscLogGpuTimeEnd();
1205:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1206:   return(0);
1207: }

1209: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1210: {
1211:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1212:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1213:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1214:   PetscInt                     m = A->rmap->n,*ii,*ridx;
1215:   PetscErrorCode               ierr;
1216:   cusparseStatus_t             stat;
1217:   cudaError_t                  err;

1220:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1221:     PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1222:     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1223:       /* Copy values only */
1224:       CsrMatrix *mat,*matT;
1225:       mat  = (CsrMatrix*)cusparsestruct->mat->mat;
1226:       mat->values->assign(a->a, a->a+a->nz);
1227:       PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));

1229:       /* Update matT when it was built before */
1230:       if (cusparsestruct->matTranspose) {
1231:         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1232:         matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1233:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1234:                                  A->cmap->n, mat->num_entries,
1235:                                  mat->values->data().get(),
1236:                                  cusparsestruct->rowoffsets_gpu->data().get(),
1237:                                  mat->column_indices->data().get(),
1238:                                  matT->values->data().get(),
1239:                                  matT->column_indices->data().get(),
1240:                                  matT->row_offsets->data().get(),
1241:                                  CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUDA(stat);
1242:       }
1243:     } else {
1244:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1245:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);
1246:       delete cusparsestruct->workVector;
1247:       delete cusparsestruct->rowoffsets_gpu;
1248:       try {
1249:         cusparsestruct->nonzerorow=0;
1250:         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

1252:         if (a->compressedrow.use) {
1253:           m    = a->compressedrow.nrows;
1254:           ii   = a->compressedrow.i;
1255:           ridx = a->compressedrow.rindex;
1256:         } else {
1257:           /* Forcing compressed row on the GPU */
1258:           int k=0;
1259:           PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1260:           PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1261:           ii[0]=0;
1262:           for (int j = 0; j<m; j++) {
1263:             if ((a->i[j+1]-a->i[j])>0) {
1264:               ii[k]  = a->i[j];
1265:               ridx[k]= j;
1266:               k++;
1267:             }
1268:           }
1269:           ii[cusparsestruct->nonzerorow] = a->nz;
1270:           m = cusparsestruct->nonzerorow;
1271:         }

1273:         /* allocate space for the triangular factor information */
1274:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1275:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1276:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1277:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

1279:         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1280:         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1281:         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1282:         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1283:         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1284:         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1285:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

1287:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1288:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1289:           /* set the matrix */
1290:           CsrMatrix *matrix= new CsrMatrix;
1291:           matrix->num_rows = m;
1292:           matrix->num_cols = A->cmap->n;
1293:           matrix->num_entries = a->nz;
1294:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1295:           matrix->row_offsets->assign(ii, ii + m+1);

1297:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1298:           matrix->column_indices->assign(a->j, a->j+a->nz);

1300:           matrix->values = new THRUSTARRAY(a->nz);
1301:           matrix->values->assign(a->a, a->a+a->nz);

1303:           /* assign the pointer */
1304:           matstruct->mat = matrix;

1306:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1307:           CsrMatrix *matrix= new CsrMatrix;
1308:           matrix->num_rows = m;
1309:           matrix->num_cols = A->cmap->n;
1310:           matrix->num_entries = a->nz;
1311:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1312:           matrix->row_offsets->assign(ii, ii + m+1);

1314:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1315:           matrix->column_indices->assign(a->j, a->j+a->nz);

1317:           matrix->values = new THRUSTARRAY(a->nz);
1318:           matrix->values->assign(a->a, a->a+a->nz);

1320:           cusparseHybMat_t hybMat;
1321:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1322:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1323:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1324:           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1325:               matstruct->descr, matrix->values->data().get(),
1326:               matrix->row_offsets->data().get(),
1327:               matrix->column_indices->data().get(),
1328:               hybMat, 0, partition);CHKERRCUDA(stat);
1329:           /* assign the pointer */
1330:           matstruct->mat = hybMat;

1332:           if (matrix) {
1333:             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1334:             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1335:             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1336:             delete (CsrMatrix*)matrix;
1337:           }
1338:         }

1340:         /* assign the compressed row indices */
1341:         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1342:         matstruct->cprowIndices->assign(ridx,ridx+m);
1343:         PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));

1345:         /* assign the pointer */
1346:         cusparsestruct->mat = matstruct;

1348:         if (!a->compressedrow.use) {
1349:           PetscFree(ii);
1350:           PetscFree(ridx);
1351:         }
1352:         cusparsestruct->workVector = new THRUSTARRAY(m);
1353:       } catch(char *ex) {
1354:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1355:       }
1356:       cusparsestruct->nonzerostate = A->nonzerostate;
1357:     }
1358:     WaitForGPU();CHKERRCUDA(ierr);
1359:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1360:     PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1361:   }
1362:   return(0);
1363: }

1365: struct VecCUDAPlusEquals
1366: {
1367:   template <typename Tuple>
1368:   __host__ __device__
1369:   void operator()(Tuple t)
1370:   {
1371:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1372:   }
1373: };

1375: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1376: {

1380:   MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);
1381:   return(0);
1382: }

1384: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1385: {
1386:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1387:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1388:   Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1389:   const PetscScalar            *xarray;
1390:   PetscScalar                  *yarray;
1391:   PetscErrorCode               ierr;
1392:   cusparseStatus_t             stat;

1395:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1396:   MatSeqAIJCUSPARSECopyToGPU(A);
1397:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1398:   if (!matstructT) {
1399:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1400:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1401:   }
1402:   VecCUDAGetArrayRead(xx,&xarray);
1403:   VecCUDAGetArrayWrite(yy,&yarray);
1404:   if (yy->map->n) {
1405:     PetscInt                     n = yy->map->n;
1406:     cudaError_t                  err;
1407:     err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
1408:   }

1410:   PetscLogGpuTimeBegin();
1411:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1412:     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1413:     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1414:                              mat->num_rows, mat->num_cols,
1415:                              mat->num_entries, matstructT->alpha, matstructT->descr,
1416:                              mat->values->data().get(), mat->row_offsets->data().get(),
1417:                              mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1418:                              yarray);CHKERRCUDA(stat);
1419:   } else {
1420:     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1421:     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1422:                              matstructT->alpha, matstructT->descr, hybMat,
1423:                              xarray, matstructT->beta_zero,
1424:                              yarray);CHKERRCUDA(stat);
1425:   }
1426:   VecCUDARestoreArrayRead(xx,&xarray);
1427:   VecCUDARestoreArrayWrite(yy,&yarray);
1428:   WaitForGPU();CHKERRCUDA(ierr);
1429:   PetscLogGpuTimeEnd();
1430:   PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1431:   return(0);
1432: }


1435: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1436: {
1437:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1438:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1439:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1440:   const PetscScalar            *xarray;
1441:   PetscScalar                  *zarray,*dptr,*beta;
1442:   PetscErrorCode               ierr;
1443:   cusparseStatus_t             stat;
1444:   PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

1447:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1448:   MatSeqAIJCUSPARSECopyToGPU(A);
1449:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1450:   try {
1451:     cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
1452:     VecCUDAGetArrayRead(xx,&xarray);
1453:     if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
1454:       VecCUDAGetArray(zz,&zarray);
1455:     } else {
1456:       VecCUDAGetArrayWrite(zz,&zarray);
1457:     }
1458:     dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
1459:     beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

1461:     PetscLogGpuTimeBegin();
1462:     /* csr_spmv is multiply add */
1463:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1464:       /* here we need to be careful to set the number of rows in the multiply to the
1465:          number of compressed rows in the matrix ... which is equivalent to the
1466:          size of the workVector */
1467:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1468:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1469:                                mat->num_rows, mat->num_cols,
1470:                                mat->num_entries, matstruct->alpha, matstruct->descr,
1471:                                mat->values->data().get(), mat->row_offsets->data().get(),
1472:                                mat->column_indices->data().get(), xarray, beta,
1473:                                dptr);CHKERRCUDA(stat);
1474:     } else {
1475:       if (cusparsestruct->workVector->size()) {
1476:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1477:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1478:                                  matstruct->alpha, matstruct->descr, hybMat,
1479:                                  xarray, beta,
1480:                                  dptr);CHKERRCUDA(stat);
1481:       }
1482:     }
1483:     PetscLogGpuTimeEnd();

1485:     if (yy) {
1486:       if (dptr != zarray) {
1487:         VecCopy_SeqCUDA(yy,zz);
1488:       } else if (zz != yy) {
1489:         VecAXPY_SeqCUDA(zz,1.0,yy);
1490:       }
1491:     } else if (dptr != zarray) {
1492:       VecSet_SeqCUDA(zz,0);
1493:     }
1494:     /* scatter the data from the temporary into the full vector with a += operation */
1495:     PetscLogGpuTimeBegin();
1496:     if (dptr != zarray) {
1497:       thrust::device_ptr<PetscScalar> zptr;

1499:       zptr = thrust::device_pointer_cast(zarray);
1500:       thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1501:                        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1502:                        VecCUDAPlusEquals());
1503:     }
1504:     PetscLogGpuTimeEnd();
1505:     VecCUDARestoreArrayRead(xx,&xarray);
1506:     if (yy && !cmpr) {
1507:       VecCUDARestoreArray(zz,&zarray);
1508:     } else {
1509:       VecCUDARestoreArrayWrite(zz,&zarray);
1510:     }
1511:   } catch(char *ex) {
1512:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1513:   }
1514:   WaitForGPU();CHKERRCUDA(ierr);
1515:   PetscLogGpuFlops(2.0*a->nz);
1516:   return(0);
1517: }

1519: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1520: {
1521:   Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
1522:   Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1523:   Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
1524:   const PetscScalar               *xarray;
1525:   PetscScalar                     *zarray,*beta;
1526:   PetscErrorCode                  ierr;
1527:   cusparseStatus_t                stat;

1530:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1531:   MatSeqAIJCUSPARSECopyToGPU(A);
1532:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1533:   if (!matstructT) {
1534:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1535:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1536:   }

1538:   /* Note unlike Mat, MatTranspose uses non-compressed row storage */
1539:   try {
1540:     VecCopy_SeqCUDA(yy,zz);
1541:     VecCUDAGetArrayRead(xx,&xarray);
1542:     VecCUDAGetArray(zz,&zarray);
1543:     beta = (yy == zz) ? matstructT->beta_one : matstructT->beta_zero;

1545:     PetscLogGpuTimeBegin();
1546:     /* multiply add with matrix transpose */
1547:     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1548:       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1549:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1550:                                mat->num_rows, mat->num_cols,
1551:                                mat->num_entries, matstructT->alpha, matstructT->descr,
1552:                                mat->values->data().get(), mat->row_offsets->data().get(),
1553:                                mat->column_indices->data().get(), xarray, beta,
1554:                                zarray);CHKERRCUDA(stat);
1555:     } else {
1556:       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1557:       if (cusparsestruct->workVector->size()) {
1558:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1559:                                  matstructT->alpha, matstructT->descr, hybMat,
1560:                                  xarray, beta,
1561:                                  zarray);CHKERRCUDA(stat);
1562:       }
1563:     }
1564:     PetscLogGpuTimeEnd();

1566:     if (zz != yy) {VecAXPY_SeqCUDA(zz,1.0,yy);}
1567:     VecCUDARestoreArrayRead(xx,&xarray);
1568:     VecCUDARestoreArray(zz,&zarray);
1569:   } catch(char *ex) {
1570:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1571:   }
1572:   WaitForGPU();CHKERRCUDA(ierr);
1573:   PetscLogGpuFlops(2.0*a->nz);
1574:   return(0);
1575: }

1577: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1578: {

1582:   MatAssemblyEnd_SeqAIJ(A,mode);
1583:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1584:   if (A->factortype == MAT_FACTOR_NONE) {
1585:     MatSeqAIJCUSPARSECopyToGPU(A);
1586:   }
1587:   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1588:   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1589:   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1590:   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1591:   return(0);
1592: }

1594: /* --------------------------------------------------------------------------------*/
1595: /*@
1596:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1597:    (the default parallel PETSc format). This matrix will ultimately pushed down
1598:    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1599:    assembly performance the user should preallocate the matrix storage by setting
1600:    the parameter nz (or the array nnz).  By setting these parameters accurately,
1601:    performance during matrix assembly can be increased by more than a factor of 50.

1603:    Collective

1605:    Input Parameters:
1606: +  comm - MPI communicator, set to PETSC_COMM_SELF
1607: .  m - number of rows
1608: .  n - number of columns
1609: .  nz - number of nonzeros per row (same for all rows)
1610: -  nnz - array containing the number of nonzeros in the various rows
1611:          (possibly different for each row) or NULL

1613:    Output Parameter:
1614: .  A - the matrix

1616:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1617:    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1618:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

1620:    Notes:
1621:    If nnz is given then nz is ignored

1623:    The AIJ format (also called the Yale sparse matrix format or
1624:    compressed row storage), is fully compatible with standard Fortran 77
1625:    storage.  That is, the stored row and column indices can begin at
1626:    either one (as in Fortran) or zero.  See the users' manual for details.

1628:    Specify the preallocated storage with either nz or nnz (not both).
1629:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1630:    allocation.  For large problems you MUST preallocate memory or you
1631:    will get TERRIBLE performance, see the users' manual chapter on matrices.

1633:    By default, this format uses inodes (identical nodes) when possible, to
1634:    improve numerical efficiency of matrix-vector products and solves. We
1635:    search for consecutive rows with the same nonzero structure, thereby
1636:    reusing matrix information to achieve increased efficiency.

1638:    Level: intermediate

1640: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1641: @*/
1642: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1643: {

1647:   MatCreate(comm,A);
1648:   MatSetSizes(*A,m,n,m,n);
1649:   MatSetType(*A,MATSEQAIJCUSPARSE);
1650:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1651:   return(0);
1652: }

1654: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1655: {
1656:   PetscErrorCode   ierr;

1659:   if (A->factortype==MAT_FACTOR_NONE) {
1660:     if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
1661:       MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1662:     }
1663:   } else {
1664:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1665:   }
1666:   MatDestroy_SeqAIJ(A);
1667:   return(0);
1668: }

1670: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1671: {
1673:   Mat C;
1674:   cusparseStatus_t stat;
1675:   cusparseHandle_t handle=0;

1678:   MatDuplicate_SeqAIJ(A,cpvalues,B);
1679:   C    = *B;
1680:   PetscFree(C->defaultvectype);
1681:   PetscStrallocpy(VECCUDA,&C->defaultvectype);

1683:   /* inject CUSPARSE-specific stuff */
1684:   if (C->factortype==MAT_FACTOR_NONE) {
1685:     /* you cannot check the inode.use flag here since the matrix was just created.
1686:        now build a GPU matrix data structure */
1687:     C->spptr = new Mat_SeqAIJCUSPARSE;
1688:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat            = 0;
1689:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose   = 0;
1690:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector     = 0;
1691:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->rowoffsets_gpu = 0;
1692:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->format         = MAT_CUSPARSE_CSR;
1693:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream         = 0;
1694:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1695:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle         = handle;
1696:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate   = 0;
1697:   } else {
1698:     /* NEXT, set the pointers to the triangular factors */
1699:     C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1700:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
1701:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
1702:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1703:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1704:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
1705:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
1706:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
1707:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
1708:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1709:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
1710:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
1711:   }

1713:   C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1714:   C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1715:   C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1716:   C->ops->mult             = MatMult_SeqAIJCUSPARSE;
1717:   C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1718:   C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1719:   C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1720:   C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

1722:   PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);

1724:   C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

1726:   PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1727:   return(0);
1728: }

1730: PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
1731: {
1733:   cusparseStatus_t stat;
1734:   cusparseHandle_t handle=0;

1737:   PetscFree(B->defaultvectype);
1738:   PetscStrallocpy(VECCUDA,&B->defaultvectype);

1740:   if (B->factortype==MAT_FACTOR_NONE) {
1741:     /* you cannot check the inode.use flag here since the matrix was just created.
1742:        now build a GPU matrix data structure */
1743:     B->spptr = new Mat_SeqAIJCUSPARSE;
1744:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat            = 0;
1745:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose   = 0;
1746:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector     = 0;
1747:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->rowoffsets_gpu = 0;
1748:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format         = MAT_CUSPARSE_CSR;
1749:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream         = 0;
1750:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1751:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle         = handle;
1752:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate   = 0;
1753:   } else {
1754:     /* NEXT, set the pointers to the triangular factors */
1755:     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1756:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
1757:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1758:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1759:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1760:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1761:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1762:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1763:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1764:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1765:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
1766:   }

1768:   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1769:   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1770:   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1771:   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1772:   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1773:   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1774:   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1775:   B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

1777:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);

1779:   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

1781:   PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1782:   return(0);
1783: }

1785: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1786: {

1790:   MatCreate_SeqAIJ(B);
1791:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);
1792:   return(0);
1793: }

1795: /*MC
1796:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

1798:    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1799:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1800:    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

1802:    Options Database Keys:
1803: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1804: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1805: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

1807:   Level: beginner

1809: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1810: M*/

1812: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


1815: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1816: {

1820:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
1821:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
1822:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
1823:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
1824:   return(0);
1825: }


1828: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1829: {
1830:   cusparseStatus_t stat;
1831:   cusparseHandle_t handle;

1834:   if (*cusparsestruct) {
1835:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1836:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1837:     delete (*cusparsestruct)->workVector;
1838:     delete (*cusparsestruct)->rowoffsets_gpu;
1839:     if (handle = (*cusparsestruct)->handle) {
1840:       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1841:     }
1842:     delete *cusparsestruct;
1843:     *cusparsestruct = 0;
1844:   }
1845:   return(0);
1846: }

1848: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1849: {
1851:   if (*mat) {
1852:     delete (*mat)->values;
1853:     delete (*mat)->column_indices;
1854:     delete (*mat)->row_offsets;
1855:     delete *mat;
1856:     *mat = 0;
1857:   }
1858:   return(0);
1859: }

1861: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1862: {
1863:   cusparseStatus_t stat;
1864:   PetscErrorCode   ierr;

1867:   if (*trifactor) {
1868:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1869:     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1870:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
1871:     delete *trifactor;
1872:     *trifactor = 0;
1873:   }
1874:   return(0);
1875: }

1877: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1878: {
1879:   CsrMatrix        *mat;
1880:   cusparseStatus_t stat;
1881:   cudaError_t      err;

1884:   if (*matstruct) {
1885:     if ((*matstruct)->mat) {
1886:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1887:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1888:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1889:       } else {
1890:         mat = (CsrMatrix*)(*matstruct)->mat;
1891:         CsrMatrix_Destroy(&mat);
1892:       }
1893:     }
1894:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1895:     delete (*matstruct)->cprowIndices;
1896:     if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1897:     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
1898:     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
1899:     delete *matstruct;
1900:     *matstruct = 0;
1901:   }
1902:   return(0);
1903: }

1905: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1906: {
1907:   cusparseHandle_t handle;
1908:   cusparseStatus_t stat;

1911:   if (*trifactors) {
1912:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1913:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1914:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1915:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1916:     delete (*trifactors)->rpermIndices;
1917:     delete (*trifactors)->cpermIndices;
1918:     delete (*trifactors)->workVector;
1919:     if (handle = (*trifactors)->handle) {
1920:       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1921:     }
1922:     delete *trifactors;
1923:     *trifactors = 0;
1924:   }
1925:   return(0);
1926: }