Actual source code: aijcusparse.cu

petsc-3.5.4 2015-05-23
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library.
  4: */

  6: #include <petscconf.h>
  7: #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
  8: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  9: #include <../src/vec/vec/impls/dvecimpl.h>
 10: #include <petsc-private/vecimpl.h>
 11: #undef VecType
 12: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

 14: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

 16: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 17: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 18: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 20: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 21: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 22: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 24: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 25: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 26: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 27: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 28: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
 29: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 30: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 31: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 32: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

 36: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 37: {
 38:   cusparseStatus_t   stat;
 39:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 42:   cusparsestruct->stream = stream;
 43:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat);
 44:   return(0);
 45: }

 49: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
 50: {
 51:   cusparseStatus_t   stat;
 52:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 55:   if (cusparsestruct->handle) {stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);}
 57:   cusparsestruct->handle = handle;
 58:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
 59:   return(0);
 60: }

 64: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
 65: {
 66:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
 68:   if (cusparsestruct->handle)
 69:     cusparsestruct->handle = 0;
 70:   return(0);
 71: }

 75: PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
 76: {
 78:   *type = MATSOLVERCUSPARSE;
 79:   return(0);
 80: }

 82: /*MC
 83:   MATSOLVERCUSPARSE = "cusparse" - A matrix solver package providing triangular solvers for sequential matrices
 84:   on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. The currently supported
 85:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
 86:   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
 87:   CUSPARSE triangular solve algorithm. However, the performance can be quite poor, so these
 88:   algorithms are not recommended. This class does NOT support direct solver operations.

 90:   Level: beginner

 92: .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
 93: M*/
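
A minimal usage sketch (the executable name and solver setup here are illustrative, not part of this file): create the matrix as AIJCUSPARSE and select this package for the preconditioner factorization, either from the command line

    ./myapp -mat_type aijcusparse -vec_type cusp -pc_type ilu -pc_factor_mat_solver_package cusparse

or programmatically on the preconditioner:

    PC pc;
    KSPGetPC(ksp,&pc);
    PCSetType(pc,PCILU);
    PCFactorSetMatSolverPackage(pc,MATSOLVERCUSPARSE);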

 97: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
 98: {
100:   PetscInt       n = A->rmap->n;

103:   MatCreate(PetscObjectComm((PetscObject)A),B);
104:   (*B)->factortype = ftype;
105:   MatSetSizes(*B,n,n,n,n);
106:   MatSetType(*B,MATSEQAIJCUSPARSE);

108:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
109:     MatSetBlockSizesFromMats(*B,A,A);
110:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
111:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
115:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

117:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
118:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);
119:   return(0);
120: }
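
A sketch of the same flow driven directly through the Mat interface (the vectors b and x, the natural ordering, and error checking are assumed or omitted):

    Mat           F;
    IS            row,col;
    MatFactorInfo info;

    MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);   /* dispatches to MatGetFactor_seqaij_cusparse() */
    MatFactorInfoInitialize(&info);
    MatGetOrdering(A,MATORDERINGNATURAL,&row,&col);
    MatILUFactorSymbolic(F,A,row,col,&info);
    MatLUFactorNumeric(F,A,&info);
    MatSolve(F,b,x);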

124: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
125: {
126:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

129: #if CUDA_VERSION>=4020
130:   switch (op) {
131:   case MAT_CUSPARSE_MULT:
132:     cusparsestruct->format = format;
133:     break;
134:   case MAT_CUSPARSE_ALL:
135:     cusparsestruct->format = format;
136:     break;
137:   default:
138:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
139:   }
140: #else
141:   if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
142:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage formats require CUDA 4.2 or later.");
143: #endif
144:   return(0);
145: }

147: /*@
148:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
149:    operation. Only the MatMult operation can use different GPU storage formats
150:    for MPIAIJCUSPARSE matrices.
151:    Not Collective

153:    Input Parameters:
154: +  A - Matrix of type SEQAIJCUSPARSE
155: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
156: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

160:    Level: intermediate

162: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
163: @*/
166: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
167: {

172:   PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
173:   return(0);
174: }
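
For example, to use the HYB format only for the SpMV kernel while the triangular solves keep CSR (assuming A is already of type SEQAIJCUSPARSE):

    MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_HYB);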

178: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
179: {
180:   PetscErrorCode           ierr;
181:   MatCUSPARSEStorageFormat format;
182:   PetscBool                flg;

185:   PetscObjectOptionsBegin((PetscObject)A);
186:   PetscOptionsHead("SeqAIJCUSPARSE options");
187:   if (A->factortype==MAT_FACTOR_NONE) {
188:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
189:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);
190:     if (flg) {
191:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
192:     }
193:   }
194:   PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
195:                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);
196:   if (flg) {
197:     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
198:   }
199:   PetscOptionsEnd();
200:   return(0);

202: }
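
The same formats can be requested at runtime; for instance (the executable name is a placeholder):

    ./myapp -mat_type aijcusparse -mat_cusparse_mult_storage_format hyb
    ./myapp -mat_type aijcusparse -mat_cusparse_storage_format ell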

206: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
207: {

211:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
212:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
213:   return(0);
214: }

218: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
219: {

223:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
224:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
225:   return(0);
226: }

230: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
231: {

235:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
236:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
237:   return(0);
238: }

242: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
243: {

247:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
248:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
249:   return(0);
250: }

254: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
255: {
256:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
257:   PetscInt                          n = A->rmap->n;
258:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
259:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
260:   cusparseStatus_t                  stat;
261:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
262:   const MatScalar                   *aa = a->a,*v;
263:   PetscInt                          *AiLo, *AjLo;
264:   PetscScalar                       *AALo;
265:   PetscInt                          i,nz, nzLower, offset, rowOffset;
266:   PetscErrorCode                    ierr;

269:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
270:     try {
271:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
272:       nzLower=n+ai[n]-ai[1];

274:       /* Allocate Space for the lower triangular matrix */
275:       cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
276:       cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
277:       cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

279:       /* Fill the lower triangular matrix */
280:       AiLo[0]  = (PetscInt) 0;
281:       AiLo[n]  = nzLower;
282:       AjLo[0]  = (PetscInt) 0;
283:       AALo[0]  = (MatScalar) 1.0;
284:       v        = aa;
285:       vi       = aj;
286:       offset   = 1;
287:       rowOffset= 1;
288:       for (i=1; i<n; i++) {
289:         nz = ai[i+1] - ai[i];
290:         /* additional 1 for the term on the diagonal */
291:         AiLo[i]    = rowOffset;
292:         rowOffset += nz+1;

294:         PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));
295:         PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));

297:         offset      += nz;
298:         AjLo[offset] = (PetscInt) i;
299:         AALo[offset] = (MatScalar) 1.0;
300:         offset      += 1;

302:         v  += nz;
303:         vi += nz;
304:       }

306:       /* allocate space for the triangular factor information */
307:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

309:       /* Create the matrix description */
310:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
311:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
312:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
313:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
314:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

316:       /* Create the solve analysis information */
317:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

319:       /* set the operation */
320:       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

322:       /* set the matrix */
323:       loTriFactor->csrMat = new CsrMatrix;
324:       loTriFactor->csrMat->num_rows = n;
325:       loTriFactor->csrMat->num_cols = n;
326:       loTriFactor->csrMat->num_entries = nzLower;

328:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
329:       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

331:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
332:       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

334:       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
335:       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

337:       /* perform the solve analysis */
338:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
339:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
340:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
341:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

343:       /* assign the pointer. Is this really necessary? */
344:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

346:       cudaFreeHost(AiLo);CHKERRCUSP(ierr);
347:       cudaFreeHost(AjLo);CHKERRCUSP(ierr);
348:       cudaFreeHost(AALo);CHKERRCUSP(ierr);
349:     } catch(char *ex) {
350:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
351:     }
352:   }
353:   return(0);
354: }
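
To illustrate the arrays assembled above with a hypothetical 3x3 example (not data from the code): for a unit lower triangular factor with L(1,0)=4, L(2,0)=5 and L(2,1)=6, nzLower = n + (strictly lower nonzeros) = 3 + 3 = 6, and the host arrays handed to CUSPARSE are

    AiLo = {0, 1, 3, 6};            /* row offsets */
    AjLo = {0,  0, 1,  0, 1, 2};    /* column indices */
    AALo = {1,  4, 1,  5, 6, 1};    /* values, with the unit diagonal stored explicitly */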

358: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
359: {
360:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
361:   PetscInt                          n = A->rmap->n;
362:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
363:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
364:   cusparseStatus_t                  stat;
365:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
366:   const MatScalar                   *aa = a->a,*v;
367:   PetscInt                          *AiUp, *AjUp;
368:   PetscScalar                       *AAUp;
369:   PetscInt                          i,nz, nzUpper, offset;
370:   PetscErrorCode                    ierr;

373:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
374:     try {
375:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
376:       nzUpper = adiag[0]-adiag[n];

378:       /* Allocate Space for the upper triangular matrix */
379:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
380:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
381:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

383:       /* Fill the upper triangular matrix */
384:       AiUp[0]=(PetscInt) 0;
385:       AiUp[n]=nzUpper;
386:       offset = nzUpper;
387:       for (i=n-1; i>=0; i--) {
388:         v  = aa + adiag[i+1] + 1;
389:         vi = aj + adiag[i+1] + 1;

391:         /* number of elements NOT on the diagonal */
392:         nz = adiag[i] - adiag[i+1]-1;

394:         /* decrement the offset */
395:         offset -= (nz+1);

397:         /* first, set the diagonal elements */
398:         AjUp[offset] = (PetscInt) i;
399:         AAUp[offset] = 1./v[nz];
400:         AiUp[i]      = AiUp[i+1] - (nz+1);

402:         PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));
403:         PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));
404:       }

406:       /* allocate space for the triangular factor information */
407:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

409:       /* Create the matrix description */
410:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
411:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
412:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
413:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
414:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

416:       /* Create the solve analysis information */
417:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

419:       /* set the operation */
420:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

422:       /* set the matrix */
423:       upTriFactor->csrMat = new CsrMatrix;
424:       upTriFactor->csrMat->num_rows = n;
425:       upTriFactor->csrMat->num_cols = n;
426:       upTriFactor->csrMat->num_entries = nzUpper;

428:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
429:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

431:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
432:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

434:       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
435:       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

437:       /* perform the solve analysis */
438:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
439:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
440:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
441:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

443:       /* assign the pointer. Is this really necessary? */
444:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

446:       cudaFreeHost(AiUp);CHKERRCUSP(ierr);
447:       cudaFreeHost(AjUp);CHKERRCUSP(ierr);
448:       cudaFreeHost(AAUp);CHKERRCUSP(ierr);
449:     } catch(char *ex) {
450:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
451:     }
452:   }
453:   return(0);
454: }

458: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
459: {
460:   PetscErrorCode               ierr;
461:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
462:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
463:   IS                           isrow = a->row,iscol = a->icol;
464:   PetscBool                    row_identity,col_identity;
465:   const PetscInt               *r,*c;
466:   PetscInt                     n = A->rmap->n;

469:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
470:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

472:   cusparseTriFactors->workVector = new THRUSTARRAY;
473:   cusparseTriFactors->workVector->resize(n);
474:   cusparseTriFactors->nnz=a->nz;

476:   A->valid_GPU_matrix = PETSC_CUSP_BOTH;
477:   /*lower triangular indices */
478:   ISGetIndices(isrow,&r);
479:   ISIdentity(isrow,&row_identity);
480:   if (!row_identity) {
481:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
482:     cusparseTriFactors->rpermIndices->assign(r, r+n);
483:   }
484:   ISRestoreIndices(isrow,&r);

486:   /*upper triangular indices */
487:   ISGetIndices(iscol,&c);
488:   ISIdentity(iscol,&col_identity);
489:   if (!col_identity) {
490:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
491:     cusparseTriFactors->cpermIndices->assign(c, c+n);
492:   }
493:   ISRestoreIndices(iscol,&c);
494:   return(0);
495: }

499: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
500: {
501:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
502:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
503:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
504:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
505:   cusparseStatus_t                  stat;
506:   PetscErrorCode                    ierr;
507:   PetscInt                          *AiUp, *AjUp;
508:   PetscScalar                       *AAUp;
509:   PetscScalar                       *AALo;
510:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
511:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
512:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
513:   const MatScalar                   *aa = b->a,*v;

516:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
517:     try {
518:       /* Allocate Space for the upper triangular matrix */
519:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
520:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
521:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
522:       cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

524:       /* Fill the upper triangular matrix */
525:       AiUp[0]=(PetscInt) 0;
526:       AiUp[n]=nzUpper;
527:       offset = 0;
528:       for (i=0; i<n; i++) {
529:         /* set the pointers */
530:         v  = aa + ai[i];
531:         vj = aj + ai[i];
532:         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

534:         /* first, set the diagonal elements */
535:         AjUp[offset] = (PetscInt) i;
536:         AAUp[offset] = 1.0/v[nz];
537:         AiUp[i]      = offset;
538:         AALo[offset] = 1.0/v[nz];

540:         offset+=1;
541:         if (nz>0) {
542:           PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));
543:           PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));
544:           for (j=offset; j<offset+nz; j++) {
545:             AAUp[j] = -AAUp[j];
546:             AALo[j] = AAUp[j]/v[nz];
547:           }
548:           offset+=nz;
549:         }
550:       }

552:       /* allocate space for the triangular factor information */
553:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

555:       /* Create the matrix description */
556:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
557:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
558:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
559:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
560:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

562:       /* Create the solve analysis information */
563:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

565:       /* set the operation */
566:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

568:       /* set the matrix */
569:       upTriFactor->csrMat = new CsrMatrix;
570:       upTriFactor->csrMat->num_rows = A->rmap->n;
571:       upTriFactor->csrMat->num_cols = A->cmap->n;
572:       upTriFactor->csrMat->num_entries = a->nz;

574:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
575:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

577:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
578:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

580:       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
581:       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

583:       /* perform the solve analysis */
584:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
585:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
586:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
587:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

589:       /* assign the pointer. Is this really necessary? */
590:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

592:       /* allocate space for the triangular factor information */
593:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

595:       /* Create the matrix description */
596:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
597:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
598:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
599:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
600:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

602:       /* Create the solve analysis information */
603:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

605:       /* set the operation */
606:       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

608:       /* set the matrix */
609:       loTriFactor->csrMat = new CsrMatrix;
610:       loTriFactor->csrMat->num_rows = A->rmap->n;
611:       loTriFactor->csrMat->num_cols = A->cmap->n;
612:       loTriFactor->csrMat->num_entries = a->nz;

614:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
615:       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

617:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
618:       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

620:       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
621:       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

623:       /* perform the solve analysis */
624:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
625:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
626:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
627:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

629:       /* assign the pointer. Is this really necessary? */
630:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

632:       A->valid_GPU_matrix = PETSC_CUSP_BOTH;
633:       cudaFreeHost(AiUp);CHKERRCUSP(ierr);
634:       cudaFreeHost(AjUp);CHKERRCUSP(ierr);
635:       cudaFreeHost(AAUp);CHKERRCUSP(ierr);
636:       cudaFreeHost(AALo);CHKERRCUSP(ierr);
637:     } catch(char *ex) {
638:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
639:     }
640:   }
641:   return(0);
642: }

646: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
647: {
648:   PetscErrorCode               ierr;
649:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
650:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
651:   IS                           ip = a->row;
652:   const PetscInt               *rip;
653:   PetscBool                    perm_identity;
654:   PetscInt                     n = A->rmap->n;

657:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
658:   cusparseTriFactors->workVector = new THRUSTARRAY;
659:   cusparseTriFactors->workVector->resize(n);
660:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;

662:   /*lower triangular indices */
663:   ISGetIndices(ip,&rip);
664:   ISIdentity(ip,&perm_identity);
665:   if (!perm_identity) {
666:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
667:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
668:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
669:     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
670:   }
671:   ISRestoreIndices(ip,&rip);
672:   return(0);
673: }
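
The nnz count above is the size of the full L+U factor implied by the stored upper triangle: for example, with n = 4 rows and a->nz = 10 stored entries (diagonal included), it gives (10-4)*2 + 4 = 16 nonzeros, which is the figure used by the flop logging in the solve routines.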

677: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
678: {
679:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
680:   IS             isrow = b->row,iscol = b->col;
681:   PetscBool      row_identity,col_identity;

685:   MatLUFactorNumeric_SeqAIJ(B,A,info);
686:   /* determine which version of MatSolve needs to be used. */
687:   ISIdentity(isrow,&row_identity);
688:   ISIdentity(iscol,&col_identity);
689:   if (row_identity && col_identity) {
690:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
691:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
692:   } else {
693:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
694:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
695:   }

697:   /* get the triangular factors */
698:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
699:   return(0);
700: }

704: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
705: {
706:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
707:   IS             ip = b->row;
708:   PetscBool      perm_identity;

712:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);

714:   /* determine which version of MatSolve needs to be used. */
715:   ISIdentity(ip,&perm_identity);
716:   if (perm_identity) {
717:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
718:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
719:   } else {
720:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
721:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
722:   }

724:   /* get the triangular factors */
725:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
726:   return(0);
727: }

731: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
732: {
733:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
734:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
735:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
736:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
737:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
738:   cusparseStatus_t                  stat;
739:   cusparseIndexBase_t               indexBase;
740:   cusparseMatrixType_t              matrixType;
741:   cusparseFillMode_t                fillMode;
742:   cusparseDiagType_t                diagType;


746:   /*********************************************/
747:   /* Now the Transpose of the Lower Tri Factor */
748:   /*********************************************/

750:   /* allocate space for the transpose of the lower triangular factor */
751:   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

753:   /* set the matrix descriptors of the lower triangular factor */
754:   matrixType = cusparseGetMatType(loTriFactor->descr);
755:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
756:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
757:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
758:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

760:   /* Create the matrix description */
761:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
762:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
763:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
764:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
765:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

767:   /* Create the solve analysis information */
768:   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

770:   /* set the operation */
771:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

773:   /* allocate GPU space for the CSC of the lower triangular factor*/
774:   loTriFactorT->csrMat = new CsrMatrix;
775:   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
776:   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
777:   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
778:   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
779:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
780:   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

782:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
783:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
784:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
785:                           loTriFactor->csrMat->values->data().get(),
786:                           loTriFactor->csrMat->row_offsets->data().get(),
787:                           loTriFactor->csrMat->column_indices->data().get(),
788:                           loTriFactorT->csrMat->values->data().get(),
789:                           loTriFactorT->csrMat->column_indices->data().get(),
790:                           loTriFactorT->csrMat->row_offsets->data().get(),
791:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

793:   /* perform the solve analysis on the transposed matrix */
794:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
795:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
796:                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
797:                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
798:                            loTriFactorT->solveInfo);CHKERRCUSP(stat);

800:   /* assign the pointer. Is this really necessary? */
801:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

803:   /*********************************************/
804:   /* Now the Transpose of the Upper Tri Factor */
805:   /*********************************************/

807:   /* allocate space for the transpose of the upper triangular factor */
808:   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

810:   /* set the matrix descriptors of the upper triangular factor */
811:   matrixType = cusparseGetMatType(upTriFactor->descr);
812:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
813:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
814:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
815:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

817:   /* Create the matrix description */
818:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
819:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
820:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
821:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
822:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

824:   /* Create the solve analysis information */
825:   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

827:   /* set the operation */
828:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

830:   /* allocate GPU space for the CSC of the upper triangular factor*/
831:   upTriFactorT->csrMat = new CsrMatrix;
832:   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
833:   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
834:   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
835:   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
836:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
837:   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

839:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
840:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
841:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
842:                           upTriFactor->csrMat->values->data().get(),
843:                           upTriFactor->csrMat->row_offsets->data().get(),
844:                           upTriFactor->csrMat->column_indices->data().get(),
845:                           upTriFactorT->csrMat->values->data().get(),
846:                           upTriFactorT->csrMat->column_indices->data().get(),
847:                           upTriFactorT->csrMat->row_offsets->data().get(),
848:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

850:   /* perform the solve analysis on the transposed matrix */
851:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
852:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
853:                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
854:                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
855:                            upTriFactorT->solveInfo);CHKERRCUSP(stat);

857:   /* assign the pointer. Is this really necessary? */
858:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
859:   return(0);
860: }
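
The cusparse_csr2csc calls above produce the CSC arrays of each factor, which, read back as CSR, are exactly the arrays of its transpose. A small illustration with hypothetical data (not from the code), for the 2x3 matrix [[1,0,2],[0,3,0]]:

    CSR of A   : row_offsets = {0,2,3},   column_indices = {0,2,1}, values = {1,2,3}
    CSC of A   : col_offsets = {0,1,2,3}, row_indices    = {0,1,0}, values = {1,3,2}

The CSC triple, reinterpreted as CSR, describes the 3x2 transpose.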

864: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
865: {
866:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
867:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
868:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
869:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
870:   cusparseStatus_t             stat;
871:   cusparseIndexBase_t          indexBase;
872:   cudaError_t                  err;


876:   /* allocate space for the transpose matrix structure */
877:   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
878:   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
879:   indexBase = cusparseGetMatIndexBase(matstruct->descr);
880:   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
881:   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

883:   /* set alpha and beta */
884:   err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
885:   err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
886:   err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
887:   err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
888:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

890:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
891:     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
892:     CsrMatrix *matrixT= new CsrMatrix;
893:     matrixT->num_rows = A->rmap->n;
894:     matrixT->num_cols = A->cmap->n;
895:     matrixT->num_entries = a->nz;
896:     matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
897:     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
898:     matrixT->values = new THRUSTARRAY(a->nz);

900:     /* compute the transpose of the matrix, i.e. the CSC */
901:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
902:     stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
903:                             matrix->num_cols, matrix->num_entries,
904:                             matrix->values->data().get(),
905:                             matrix->row_offsets->data().get(),
906:                             matrix->column_indices->data().get(),
907:                             matrixT->values->data().get(),
908:                             matrixT->column_indices->data().get(),
909:                             matrixT->row_offsets->data().get(),
910:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

912:     /* assign the pointer */
913:     matstructT->mat = matrixT;

915:   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
916: #if CUDA_VERSION>=5000
917:     /* First convert HYB to CSR */
918:     CsrMatrix *temp= new CsrMatrix;
919:     temp->num_rows = A->rmap->n;
920:     temp->num_cols = A->cmap->n;
921:     temp->num_entries = a->nz;
922:     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
923:     temp->column_indices = new THRUSTINTARRAY32(a->nz);
924:     temp->values = new THRUSTARRAY(a->nz);


927:     stat = cusparse_hyb2csr(cusparsestruct->handle,
928:                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
929:                             temp->values->data().get(),
930:                             temp->row_offsets->data().get(),
931:                             temp->column_indices->data().get());CHKERRCUSP(stat);

933:     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
934:     CsrMatrix *tempT= new CsrMatrix;
935:     tempT->num_rows = A->rmap->n;
936:     tempT->num_cols = A->cmap->n;
937:     tempT->num_entries = a->nz;
938:     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
939:     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
940:     tempT->values = new THRUSTARRAY(a->nz);

942:     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
943:                             temp->num_cols, temp->num_entries,
944:                             temp->values->data().get(),
945:                             temp->row_offsets->data().get(),
946:                             temp->column_indices->data().get(),
947:                             tempT->values->data().get(),
948:                             tempT->column_indices->data().get(),
949:                             tempT->row_offsets->data().get(),
950:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

952:     /* Last, convert CSC to HYB */
953:     cusparseHybMat_t hybMat;
954:     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
955:     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
956:       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
957:     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
958:                             matstructT->descr, tempT->values->data().get(),
959:                             tempT->row_offsets->data().get(),
960:                             tempT->column_indices->data().get(),
961:                             hybMat, 0, partition);CHKERRCUSP(stat);

963:     /* assign the pointer */
964:     matstructT->mat = hybMat;

966:     /* delete temporaries */
967:     if (tempT) {
968:       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
969:       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
970:       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
971:       delete (CsrMatrix*) tempT;
972:     }
973:     if (temp) {
974:       if (temp->values) delete (THRUSTARRAY*) temp->values;
975:       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
976:       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
977:       delete (CsrMatrix*) temp;
978:     }
979: #else
980:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
981: #endif
982:   }
983:   /* assign the compressed row indices */
984:   matstructT->cprowIndices = new THRUSTINTARRAY;

986:   /* assign the pointer */
987:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
988:   return(0);
989: }

993: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
994: {
995:   CUSPARRAY                         *xGPU, *bGPU;
996:   cusparseStatus_t                  stat;
997:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
998:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
999:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1000:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1001:   PetscErrorCode                    ierr;

1004:   /* Analyze the matrix and create the transpose ... on the fly */
1005:   if (!loTriFactorT && !upTriFactorT) {
1006:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1007:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1008:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1009:   }

1011:   /* Get the GPU pointers */
1012:   VecCUSPGetArrayWrite(xx,&xGPU);
1013:   VecCUSPGetArrayRead(bb,&bGPU);

1015:   /* First, reorder with the row permutation */
1016:   thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
1017:                thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
1018:                xGPU->begin());

1020:   /* First, solve U */
1021:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1022:                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1023:                         upTriFactorT->csrMat->values->data().get(),
1024:                         upTriFactorT->csrMat->row_offsets->data().get(),
1025:                         upTriFactorT->csrMat->column_indices->data().get(),
1026:                         upTriFactorT->solveInfo,
1027:                         xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

1029:   /* Then, solve L */
1030:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1031:                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1032:                         loTriFactorT->csrMat->values->data().get(),
1033:                         loTriFactorT->csrMat->row_offsets->data().get(),
1034:                         loTriFactorT->csrMat->column_indices->data().get(),
1035:                         loTriFactorT->solveInfo,
1036:                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

1038:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1039:   thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1040:                thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
1041:                tempGPU->begin());

1043:   /* Copy the temporary to the full solution. */
1044:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

1046:   /* restore */
1047:   VecCUSPRestoreArrayRead(bb,&bGPU);
1048:   VecCUSPRestoreArrayWrite(xx,&xGPU);
1049:   WaitForGPU();CHKERRCUSP(ierr);

1051:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1052:   return(0);
1053: }

1057: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1058: {
1059:   CUSPARRAY                         *xGPU,*bGPU;
1060:   cusparseStatus_t                  stat;
1061:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1062:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1063:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1064:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1065:   PetscErrorCode                    ierr;

1068:   /* Analyze the matrix and create the transpose ... on the fly */
1069:   if (!loTriFactorT && !upTriFactorT) {
1070:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1071:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1072:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1073:   }

1075:   /* Get the GPU pointers */
1076:   VecCUSPGetArrayWrite(xx,&xGPU);
1077:   VecCUSPGetArrayRead(bb,&bGPU);

1079:   /* First, solve U */
1080:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1081:                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1082:                         upTriFactorT->csrMat->values->data().get(),
1083:                         upTriFactorT->csrMat->row_offsets->data().get(),
1084:                         upTriFactorT->csrMat->column_indices->data().get(),
1085:                         upTriFactorT->solveInfo,
1086:                         bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

1088:   /* Then, solve L */
1089:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1090:                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1091:                         loTriFactorT->csrMat->values->data().get(),
1092:                         loTriFactorT->csrMat->row_offsets->data().get(),
1093:                         loTriFactorT->csrMat->column_indices->data().get(),
1094:                         loTriFactorT->solveInfo,
1095:                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

1097:   /* restore */
1098:   VecCUSPRestoreArrayRead(bb,&bGPU);
1099:   VecCUSPRestoreArrayWrite(xx,&xGPU);
1100:   WaitForGPU();CHKERRCUSP(ierr);
1101:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1102:   return(0);
1103: }

1107: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1108: {
1109:   CUSPARRAY                         *xGPU,*bGPU;
1110:   cusparseStatus_t                  stat;
1111:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1112:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1113:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1114:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1115:   PetscErrorCode                    ierr;

1118:   /* Get the GPU pointers */
1119:   VecCUSPGetArrayWrite(xx,&xGPU);
1120:   VecCUSPGetArrayRead(bb,&bGPU);

1122:   /* First, reorder with the row permutation */
1123:   thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
1124:                thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
1125:                xGPU->begin());

1127:   /* Next, solve L */
1128:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1129:                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1130:                         loTriFactor->csrMat->values->data().get(),
1131:                         loTriFactor->csrMat->row_offsets->data().get(),
1132:                         loTriFactor->csrMat->column_indices->data().get(),
1133:                         loTriFactor->solveInfo,
1134:                         xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

1136:   /* Then, solve U */
1137:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1138:                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1139:                         upTriFactor->csrMat->values->data().get(),
1140:                         upTriFactor->csrMat->row_offsets->data().get(),
1141:                         upTriFactor->csrMat->column_indices->data().get(),
1142:                         upTriFactor->solveInfo,
1143:                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

1145:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1146:   thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1147:                thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
1148:                tempGPU->begin());

1150:   /* Copy the temporary to the full solution. */
1151:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

1153:   VecCUSPRestoreArrayRead(bb,&bGPU);
1154:   VecCUSPRestoreArrayWrite(xx,&xGPU);
1155:   WaitForGPU();CHKERRCUSP(ierr);
1156:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1157:   return(0);
1158: }
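
The reordering steps above gather b[perm[i]] into x[i] with thrust permutation iterators. A self-contained sketch of the same idiom, outside PETSc:

    #include <thrust/device_vector.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <thrust/copy.h>

    int main()
    {
      thrust::device_vector<double> b(3), x(3);
      thrust::device_vector<int>    perm(3);
      b[0] = 10; b[1] = 20; b[2] = 30;
      perm[0] = 2; perm[1] = 0; perm[2] = 1;

      /* x[i] = b[perm[i]]  ->  x becomes {30, 10, 20} */
      thrust::copy(thrust::make_permutation_iterator(b.begin(), perm.begin()),
                   thrust::make_permutation_iterator(b.begin(), perm.end()),
                   x.begin());
      return 0;
    }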

1162: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1163: {
1164:   CUSPARRAY                         *xGPU,*bGPU;
1165:   cusparseStatus_t                  stat;
1166:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1167:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1168:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1169:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1170:   PetscErrorCode                    ierr;

1173:   /* Get the GPU pointers */
1174:   VecCUSPGetArrayWrite(xx,&xGPU);
1175:   VecCUSPGetArrayRead(bb,&bGPU);

1177:   /* First, solve L */
1178:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1179:                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1180:                         loTriFactor->csrMat->values->data().get(),
1181:                         loTriFactor->csrMat->row_offsets->data().get(),
1182:                         loTriFactor->csrMat->column_indices->data().get(),
1183:                         loTriFactor->solveInfo,
1184:                         bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

1186:   /* Next, solve U */
1187:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1188:                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1189:                         upTriFactor->csrMat->values->data().get(),
1190:                         upTriFactor->csrMat->row_offsets->data().get(),
1191:                         upTriFactor->csrMat->column_indices->data().get(),
1192:                         upTriFactor->solveInfo,
1193:                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

1195:   VecCUSPRestoreArrayRead(bb,&bGPU);
1196:   VecCUSPRestoreArrayWrite(xx,&xGPU);
1197:   WaitForGPU();CHKERRCUSP(ierr);
1198:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1199:   return(0);
1200: }

1204: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1205: {

1207:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1208:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1209:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1210:   PetscInt                     m = A->rmap->n,*ii,*ridx;
1211:   PetscErrorCode               ierr;
1212:   cusparseStatus_t             stat;
1213:   cudaError_t                  err;

1216:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
1217:     PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1218:     if (matstruct) {
1219:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
1220:     }
1221:     try {
1222:       cusparsestruct->nonzerorow=0;
1223:       for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

1225:       if (a->compressedrow.use) {
1226:         m    = a->compressedrow.nrows;
1227:         ii   = a->compressedrow.i;
1228:         ridx = a->compressedrow.rindex;
1229:       } else {
1230:         /* Forcing compressed row on the GPU */
1231:         int k=0;
1232:         PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);
1233:         PetscMalloc1((cusparsestruct->nonzerorow), &ridx);
1234:         ii[0]=0;
1235:         for (int j = 0; j<m; j++) {
1236:           if ((a->i[j+1]-a->i[j])>0) {
1237:             ii[k]  = a->i[j];
1238:             ridx[k]= j;
1239:             k++;
1240:           }
1241:         }
1242:         ii[cusparsestruct->nonzerorow] = a->nz;
1243:         m = cusparsestruct->nonzerorow;
1244:       }

1246:       /* allocate space for the matrix-vector multiply data structure */
1247:       matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1248:       stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
1249:       stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
1250:       stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

1252:       err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
1253:       err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1254:       err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
1255:       err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1256:       stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

1258:       /* Build the matrix in CSR, ELL, or hybrid/ellpack form, depending on the chosen storage format */
1259:       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1260:         /* set the matrix */
1261:         CsrMatrix *matrix= new CsrMatrix;
1262:         matrix->num_rows = m;
1263:         matrix->num_cols = A->cmap->n;
1264:         matrix->num_entries = a->nz;
1265:         matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1266:         matrix->row_offsets->assign(ii, ii + m+1);

1268:         matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1269:         matrix->column_indices->assign(a->j, a->j+a->nz);

1271:         matrix->values = new THRUSTARRAY(a->nz);
1272:         matrix->values->assign(a->a, a->a+a->nz);

1274:         /* assign the pointer */
1275:         matstruct->mat = matrix;

1277:       } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1278: #if CUDA_VERSION>=4020
1279:         CsrMatrix *matrix= new CsrMatrix;
1280:         matrix->num_rows = m;
1281:         matrix->num_cols = A->cmap->n;
1282:         matrix->num_entries = a->nz;
1283:         matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1284:         matrix->row_offsets->assign(ii, ii + m+1);

1286:         matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1287:         matrix->column_indices->assign(a->j, a->j+a->nz);

1289:         matrix->values = new THRUSTARRAY(a->nz);
1290:         matrix->values->assign(a->a, a->a+a->nz);

1292:         cusparseHybMat_t hybMat;
1293:         stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
1294:         cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1295:           CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1296:         stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1297:                                 matstruct->descr, matrix->values->data().get(),
1298:                                 matrix->row_offsets->data().get(),
1299:                                 matrix->column_indices->data().get(),
1300:                                 hybMat, 0, partition);CHKERRCUSP(stat);
1301:         /* assign the pointer */
1302:         matstruct->mat = hybMat;

1304:         if (matrix) {
1305:           if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1306:           if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1307:           if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1308:           delete (CsrMatrix*)matrix;
1309:         }
1310: #endif
1311:       }

1313:       /* assign the compressed row indices */
1314:       matstruct->cprowIndices = new THRUSTINTARRAY(m);
1315:       matstruct->cprowIndices->assign(ridx,ridx+m);

1317:       /* assign the pointer */
1318:       cusparsestruct->mat = matstruct;

1320:       if (!a->compressedrow.use) {
1321:         PetscFree(ii);
1322:         PetscFree(ridx);
1323:       }
1324:       cusparsestruct->workVector = new THRUSTARRAY;
1325:       cusparsestruct->workVector->resize(m);
1326:     } catch(char *ex) {
1327:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1328:     }
1329:     WaitForGPU();CHKERRCUSP(ierr);

1331:     A->valid_GPU_matrix = PETSC_CUSP_BOTH;

1333:     PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1334:   }
1335:   return(0);
1336: }

1340: static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
1341: {
1343:   PetscInt rbs,cbs;

1346:   MatGetBlockSizes(mat,&rbs,&cbs);
1347:   if (right) {
1348:     VecCreate(PetscObjectComm((PetscObject)mat),right);
1349:     VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
1350:     VecSetBlockSize(*right,cbs);
1351:     VecSetType(*right,VECSEQCUSP);
1352:     PetscLayoutReference(mat->cmap,&(*right)->map);
1353:   }
1354:   if (left) {
1355:     VecCreate(PetscObjectComm((PetscObject)mat),left);
1356:     VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
1357:     VecSetBlockSize(*left,rbs);
1358:     VecSetType(*left,VECSEQCUSP);
1359:     PetscLayoutReference(mat->rmap,&(*left)->map);
1360:   }
1361:   return(0);
1362: }

1364: struct VecCUSPPlusEquals
1365: {
1366:   template <typename Tuple>
1367:   __host__ __device__
1368:   void operator()(Tuple t)
1369:   {
1370:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1371:   }
1372: };
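The VecCUSPPlusEquals functor above is applied later in this file through thrust::for_each over a zip of the compressed work vector and a permutation view of the destination vector. The following sketch is illustrative only and is not part of the PETSc source: it shows the same scatter-add pattern on plain thrust::device_vector containers, and ScatterAddExample is a hypothetical helper name.

#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/tuple.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>

/* Illustrative sketch (not part of the PETSc source): perform
   z[rows[i]] += work[i] for each compressed row i, mirroring the
   zip/permutation-iterator scatter-add used with VecCUSPPlusEquals below. */
static void ScatterAddExample(void)
{
  thrust::device_vector<double> work(2);    /* compressed results, e.g. from an SpMV   */
  thrust::device_vector<double> z(5,1.0);   /* full-length destination vector          */
  thrust::device_vector<int>    rows(2);    /* global row index of each compressed row */
  work[0] = 10.0; work[1] = 20.0;
  rows[0] = 1;    rows[1] = 3;

  thrust::for_each(
    thrust::make_zip_iterator(thrust::make_tuple(work.begin(),
        thrust::make_permutation_iterator(z.begin(), rows.begin()))),
    thrust::make_zip_iterator(thrust::make_tuple(work.begin(),
        thrust::make_permutation_iterator(z.begin(), rows.begin()))) + work.size(),
    VecCUSPPlusEquals());                   /* z becomes {1, 11, 1, 21, 1} */
}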

1376: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1377: {
1378:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1379:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1380:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1381:   CUSPARRAY                    *xarray,*yarray;
1382:   PetscErrorCode               ierr;
1383:   cusparseStatus_t             stat;

1386:   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1387:      MatSeqAIJCUSPARSECopyToGPU(A); */
1388:   VecCUSPGetArrayRead(xx,&xarray);
1389:   VecCUSPGetArrayWrite(yy,&yarray);
1390:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1391:     CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1392:     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1393:                              mat->num_rows, mat->num_cols, mat->num_entries,
1394:                              matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
1395:                              mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
1396:                              yarray->data().get());CHKERRCUSP(stat);
1397:   } else {
1398: #if CUDA_VERSION>=4020
1399:     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1400:     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1401:                              matstruct->alpha, matstruct->descr, hybMat,
1402:                              xarray->data().get(), matstruct->beta,
1403:                              yarray->data().get());CHKERRCUSP(stat);
1404: #endif
1405:   }
1406:   VecCUSPRestoreArrayRead(xx,&xarray);
1407:   VecCUSPRestoreArrayWrite(yy,&yarray);
1408:   if (!cusparsestruct->stream) {
1409:     WaitForGPU();CHKERRCUSP(ierr);
1410:   }
1411:   PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1412:   return(0);
1413: }

1417: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1418: {
1419:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1420:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1421:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1422:   CUSPARRAY                    *xarray,*yarray;
1423:   PetscErrorCode               ierr;
1424:   cusparseStatus_t             stat;

1427:   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1428:      MatSeqAIJCUSPARSECopyToGPU(A); */
1429:   if (!matstructT) {
1430:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1431:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1432:   }
1433:   VecCUSPGetArrayRead(xx,&xarray);
1434:   VecCUSPGetArrayWrite(yy,&yarray);

1436:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1437:     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1438:     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1439:                              mat->num_rows, mat->num_cols,
1440:                              mat->num_entries, matstructT->alpha, matstructT->descr,
1441:                              mat->values->data().get(), mat->row_offsets->data().get(),
1442:                              mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
1443:                              yarray->data().get());CHKERRCUSP(stat);
1444:   } else {
1445: #if CUDA_VERSION>=4020
1446:     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1447:     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1448:                              matstructT->alpha, matstructT->descr, hybMat,
1449:                              xarray->data().get(), matstructT->beta,
1450:                              yarray->data().get());CHKERRCUSP(stat);
1451: #endif
1452:   }
1453:   VecCUSPRestoreArrayRead(xx,&xarray);
1454:   VecCUSPRestoreArrayWrite(yy,&yarray);
1455:   if (!cusparsestruct->stream) {
1456:     WaitForGPU();CHKERRCUSP(ierr);
1457:   }
1458:   PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1459:   return(0);
1460: }


1465: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1466: {
1467:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1468:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1469:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1470:   CUSPARRAY                    *xarray,*yarray,*zarray;
1471:   PetscErrorCode               ierr;
1472:   cusparseStatus_t             stat;

1475:   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1476:      MatSeqAIJCUSPARSECopyToGPU(A); */
1477:   try {
1478:     VecCopy_SeqCUSP(yy,zz);
1479:     VecCUSPGetArrayRead(xx,&xarray);
1480:     VecCUSPGetArrayRead(yy,&yarray);
1481:     VecCUSPGetArrayWrite(zz,&zarray);

1483:     /* multiply add */
1484:     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1485:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1486:       /* here we need to be careful to set the number of rows in the multiply to the
1487:          number of compressed rows in the matrix ... which is equivalent to the
1488:          size of the workVector */
1489:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1490:                                mat->num_rows, mat->num_cols,
1491:                                mat->num_entries, matstruct->alpha, matstruct->descr,
1492:                                mat->values->data().get(), mat->row_offsets->data().get(),
1493:                                mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
1494:                                cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1495:     } else {
1496: #if CUDA_VERSION>=4020
1497:       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1498:       if (cusparsestruct->workVector->size()) {
1499:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1500:                                  matstruct->alpha, matstruct->descr, hybMat,
1501:                                  xarray->data().get(), matstruct->beta,
1502:                                  cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1503:       }
1504: #endif
1505:     }

1507:     /* scatter the data from the temporary into the full vector with a += operation */
1508:     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
1509:                      thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1510:                      VecCUSPPlusEquals());
1511:     VecCUSPRestoreArrayRead(xx,&xarray);
1512:     VecCUSPRestoreArrayRead(yy,&yarray);
1513:     VecCUSPRestoreArrayWrite(zz,&zarray);

1515:   } catch(char *ex) {
1516:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1517:   }
1518:   WaitForGPU();CHKERRCUSP(ierr);
1519:   PetscLogFlops(2.0*a->nz);
1520:   return(0);
1521: }

1525: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1526: {
1527:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1528:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1529:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1530:   CUSPARRAY                    *xarray,*yarray,*zarray;
1531:   PetscErrorCode               ierr;
1532:   cusparseStatus_t             stat;

1535:   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1536:      MatSeqAIJCUSPARSECopyToGPU(A); */
1537:   if (!matstructT) {
1538:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1539:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1540:   }

1542:   try {
1543:     VecCopy_SeqCUSP(yy,zz);
1544:     VecCUSPGetArrayRead(xx,&xarray);
1545:     VecCUSPGetArrayRead(yy,&yarray);
1546:     VecCUSPGetArrayWrite(zz,&zarray);

1548:     /* multiply add with matrix transpose */
1549:     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1550:       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1551:       /* here we need to be careful to set the number of rows in the multiply to the
1552:          number of compressed rows in the matrix ... which is equivalent to the
1553:          size of the workVector */
1554:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1555:                                mat->num_rows, mat->num_cols,
1556:                                mat->num_entries, matstructT->alpha, matstructT->descr,
1557:                                mat->values->data().get(), mat->row_offsets->data().get(),
1558:                                mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
1559:                                cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1560:     } else {
1561: #if CUDA_VERSION>=4020
1562:       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1563:       if (cusparsestruct->workVector->size()) {
1564:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1565:                                  matstructT->alpha, matstructT->descr, hybMat,
1566:                                  xarray->data().get(), matstructT->beta,
1567:                                  cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1568:       }
1569: #endif
1570:     }

1572:     /* scatter the data from the temporary into the full vector with a += operation */
1573:     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
1574:                      thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1575:                      VecCUSPPlusEquals());

1577:     VecCUSPRestoreArrayRead(xx,&xarray);
1578:     VecCUSPRestoreArrayRead(yy,&yarray);
1579:     VecCUSPRestoreArrayWrite(zz,&zarray);

1581:   } catch(char *ex) {
1582:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1583:   }
1584:   WaitForGPU();CHKERRCUSP(ierr);
1585:   PetscLogFlops(2.0*a->nz);
1586:   return(0);
1587: }

1591: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1592: {

1596:   MatAssemblyEnd_SeqAIJ(A,mode);
1597:   if (A->factortype==MAT_FACTOR_NONE) {
1598:     MatSeqAIJCUSPARSECopyToGPU(A);
1599:   }
1600:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1601:   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1602:   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1603:   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1604:   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1605:   return(0);
1606: }

1608: /* --------------------------------------------------------------------------------*/
1611: /*@
1612:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1613:    (the default parallel PETSc format). This matrix will ultimately be pushed down
1614:    to NVIDIA GPUs and will use the CUSPARSE library for calculations. For good matrix
1615:    assembly performance the user should preallocate the matrix storage by setting
1616:    the parameter nz (or the array nnz).  By setting these parameters accurately,
1617:    performance during matrix assembly can be increased by more than a factor of 50.

1619:    Collective on MPI_Comm

1621:    Input Parameters:
1622: +  comm - MPI communicator, set to PETSC_COMM_SELF
1623: .  m - number of rows
1624: .  n - number of columns
1625: .  nz - number of nonzeros per row (same for all rows)
1626: -  nnz - array containing the number of nonzeros in the various rows
1627:          (possibly different for each row) or NULL

1629:    Output Parameter:
1630: .  A - the matrix

1632:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1633:    MatXXXXSetPreallocation() paradigm instead of this routine directly (a usage sketch follows the routine below).
1634:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

1636:    Notes:
1637:    If nnz is given then nz is ignored

1639:    The AIJ format (also called the Yale sparse matrix format or
1640:    compressed row storage) is fully compatible with standard Fortran 77
1641:    storage.  That is, the stored row and column indices can begin at
1642:    either one (as in Fortran) or zero.  See the users' manual for details.

1644:    Specify the preallocated storage with either nz or nnz (not both).
1645:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1646:    allocation.  For large problems you MUST preallocate memory or you
1647:    will get TERRIBLE performance; see the users' manual chapter on matrices.

1649:    By default, this format uses inodes (identical nodes) when possible, to
1650:    improve numerical efficiency of matrix-vector products and solves. We
1651:    search for consecutive rows with the same nonzero structure, thereby
1652:    reusing matrix information to achieve increased efficiency.

1654:    Level: intermediate

1656: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1657: @*/
1658: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1659: {

1663:   MatCreate(comm,A);
1664:   MatSetSizes(*A,m,n,m,n);
1665:   MatSetType(*A,MATSEQAIJCUSPARSE);
1666:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1667:   return(0);
1668: }
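As the man page above recommends, the usual way to obtain one of these matrices is the MatCreate()/MatSetType()/MatSeqAIJSetPreallocation() paradigm rather than calling MatCreateSeqAIJCUSPARSE() directly. The sketch below is illustrative only and not part of the PETSc source; it assumes a PETSc build configured with CUDA/CUSPARSE support, and CreateExample is a hypothetical helper name.

#include <petscmat.h>

/* Illustrative sketch (not part of the PETSc source): create a seqaijcusparse
   matrix via the recommended paradigm, preallocate, insert entries, and assemble.
   MatAssemblyEnd() on this type also copies the matrix to the GPU. */
static PetscErrorCode CreateExample(PetscInt n,Mat *A)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscScalar    v = 1.0;

  ierr = MatCreate(PETSC_COMM_SELF,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(*A,1,NULL);CHKERRQ(ierr);   /* one nonzero per row */
  for (i=0; i<n; i++) {
    ierr = MatSetValues(*A,1,&i,1,&i,&v,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(*A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  return 0;
}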

1672: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1673: {
1674:   PetscErrorCode   ierr;
1675:   cusparseStatus_t stat;
1676:   cudaError_t      err;
1678:   if (A->factortype==MAT_FACTOR_NONE) {
1679:     try {
1680:       if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
1681:         Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1682:         Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1683:         Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;

1685:         /* delete all the members in each of the data structures */
1686:         if (matstruct) {
1687:           if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); }
1688:           if (matstruct->mat) {
1689:             if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1690: #if CUDA_VERSION>=4020
1691:               cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat;
1692:               stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
1693: #endif
1694:             } else {
1695:               CsrMatrix* mat = (CsrMatrix*)matstruct->mat;
1696:               if (mat->values) delete (THRUSTARRAY*)mat->values;
1697:               if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1698:               if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1699:               delete (CsrMatrix*)matstruct->mat;
1700:             }
1701:           }
1702:           if (matstruct->alpha) { err=cudaFree(matstruct->alpha);CHKERRCUSP(err); }
1703:           if (matstruct->beta) { err=cudaFree(matstruct->beta);CHKERRCUSP(err); }
1704:           if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices;
1705:           if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct;
1706:         }
1707:         if (matstructT) {
1708:           if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); }
1709:           if (matstructT->mat) {
1710:             if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1711: #if CUDA_VERSION>=4020
1712:               cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat;
1713:               stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
1714: #endif
1715:             } else {
1716:               CsrMatrix* matT = (CsrMatrix*)matstructT->mat;
1717:               if (matT->values) delete (THRUSTARRAY*)matT->values;
1718:               if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices;
1719:               if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets;
1720:               delete (CsrMatrix*)matstructT->mat;
1721:             }
1722:           }
1723:           if (matstructT->alpha) { err=cudaFree(matstructT->alpha);CHKERRCUSP(err); }
1724:           if (matstructT->beta) { err=cudaFree(matstructT->beta);CHKERRCUSP(err); }
1725:           if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices;
1726:           if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT;
1727:         }
1728:         if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector;
1729:         if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
1730:         if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
1731:         A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
1732:       } else {
1733:         Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1734:         if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
1735:         if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
1736:       }
1737:     } catch(char *ex) {
1738:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1739:     }
1740:   } else {
1741:     /* The triangular factors */
1742:     try {
1743:       Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1744:       Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1745:       Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1746:       Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1747:       Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;

1749:       /* delete all the members in each of the data structures */
1750:       if (loTriFactor) {
1751:         if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); }
1752:         if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); }
1753:         if (loTriFactor->csrMat) {
1754:           if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values;
1755:           if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices;
1756:           if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets;
1757:           delete (CsrMatrix*)loTriFactor->csrMat;
1758:         }
1759:         if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor;
1760:       }

1762:       if (upTriFactor) {
1763:         if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); }
1764:         if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); }
1765:         if (upTriFactor->csrMat) {
1766:           if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values;
1767:           if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices;
1768:           if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets;
1769:           delete (CsrMatrix*)upTriFactor->csrMat;
1770:         }
1771:         if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor;
1772:       }

1774:       if (loTriFactorT) {
1775:         if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); }
1776:         if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); }
1777:         if (loTriFactorT->csrMat) {
1778:           if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values;
1779:           if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices;
1780:           if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets;
1781:           delete (CsrMatrix*)loTriFactorT->csrMat;
1782:         }
1783:         if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT;
1784:       }

1786:       if (upTriFactorT) {
1787:         if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); }
1788:         if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); }
1789:         if (upTriFactorT->csrMat) {
1790:           if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values;
1791:           if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices;
1792:           if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets;
1793:           delete (CsrMatrix*)upTriFactorT->csrMat;
1794:         }
1795:         if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT;
1796:       }
1797:       if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices;
1798:       if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices;
1799:       if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector;
1800:       if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); }
1801:       if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors;
1802:     } catch(char *ex) {
1803:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1804:     }
1805:   }
1806:   /* Zero spptr here: MatDestroy() tries to PetscFree() spptr if it is nonzero, but PetscFree() only works on memory obtained from PetscNew()/PetscMalloc(), whereas spptr was allocated above with C++ new */
1807:   A->spptr = 0;

1809:   MatDestroy_SeqAIJ(A);
1810:   return(0);
1811: }

1815: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1816: {
1818:   cusparseStatus_t stat;
1819:   cusparseHandle_t handle=0;

1822:   MatCreate_SeqAIJ(B);
1823:   if (B->factortype==MAT_FACTOR_NONE) {
1824:     /* you cannot check the inode.use flag here since the matrix was just created.
1825:        now build a GPU matrix data structure */
1826:     B->spptr = new Mat_SeqAIJCUSPARSE;
1827:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1828:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1829:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1830:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1831:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1832:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = 0;
1833:     stat = cusparseCreate(&handle);CHKERRCUSP(stat);
1834:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
1835:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1836:   } else {
1837:     /* NEXT, set the pointers to the triangular factors */
1838:     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1839:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
1840:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1841:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1842:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1843:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1844:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1845:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1846:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = 0;
1847:     stat = cusparseCreate(&handle);CHKERRCUSP(stat);
1848:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1849:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
1850:   }

1852:   /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
1853:      default cusparse tri solve. Note the difference with the implementation in
1854:      MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
1855:   PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);

1857:   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1858:   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1859:   B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
1860:   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1861:   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1862:   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1863:   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1864:   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

1866:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);

1868:   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

1870:   PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1871:   return(0);
1872: }

1874: /*M
1875:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

1877:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
1878:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1879:    All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

1881:    Options Database Keys:
1882: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1883: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1884: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult only) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid); a usage sketch follows this block.

1886:   Level: beginner

1888: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1889: M*/
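The options listed above are normally picked up through MatSetFromOptions(). The sketch below is illustrative only and not part of the PETSc source; CreateFromOptionsExample is a hypothetical helper name, and the command-line flags shown in the comment are the ones documented above.

#include <petscmat.h>

/* Illustrative sketch (not part of the PETSc source): let the options database
   choose the matrix type and storage format, e.g. run the application with
       -mat_type aijcusparse -mat_cusparse_storage_format hyb                  */
static PetscErrorCode CreateFromOptionsExample(PetscInt n,Mat *A)
{
  PetscErrorCode ierr;

  ierr = MatCreate(PETSC_COMM_SELF,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATAIJ);CHKERRQ(ierr);        /* default; -mat_type overrides this */
  ierr = MatSetFromOptions(*A);CHKERRQ(ierr);        /* reads -mat_type and -mat_cusparse_* */
  ierr = MatSeqAIJSetPreallocation(*A,1,NULL);CHKERRQ(ierr);
  return 0;
}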