Actual source code: aijcusparse.cu

petsc-3.6.1 2015-08-06
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library.
  4: */

  6: #include <petscconf.h>
  7: #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
  8: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  9: #include <../src/vec/vec/impls/dvecimpl.h>
 10: #include <petsc/private/vecimpl.h>
 11: #undef VecType
 12: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

 14: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

 16: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 17: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 18: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 20: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 21: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 22: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 24: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 25: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 26: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 27: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 28: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat);
 29: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 30: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 31: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 32: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

 34: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 35: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 36: static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 37: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 38: static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 42: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 43: {
 44:   cusparseStatus_t   stat;
 45:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 48:   cusparsestruct->stream = stream;
 49:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat);
 50:   return(0);
 51: }

 55: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
 56: {
 57:   cusparseStatus_t   stat;
 58:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 61:   if (cusparsestruct->handle) {
 62:     stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);}
 63:   cusparsestruct->handle = handle;
 64:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
 65:   return(0);
 66: }

 70: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
 71: {
 72:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
 74:   if (cusparsestruct->handle)
 75:     cusparsestruct->handle = 0;
 76:   return(0);
 77: }

 81: PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
 82: {
 84:   *type = MATSOLVERCUSPARSE;
 85:   return(0);
 86: }

 88: /*MC
 89:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
 90:   of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. The currently supported
 91:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
 92:   performance in the triangular solves. Full LU and Cholesky decompositions can also be solved through the
 93:   CUSPARSE triangular solve algorithm; however, the performance can be quite poor, so these
 94:   algorithms are not recommended. This class does NOT support direct solver operations.

 96:   Level: beginner

 98: .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
 99: M*/
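/*
   Example (a minimal sketch): the usual way to reach these triangular solvers is to create
   the matrix as MATSEQAIJCUSPARSE and select "cusparse" as the solver package of an ILU (or
   ICC) preconditioner. The calls below are standard PETSc API; the vectors b and x and the
   assembly of A are assumed, and error checking is elided as elsewhere in this file.

     Mat A;
     KSP ksp;
     PC  pc;

     MatCreate(PETSC_COMM_SELF,&A);
     MatSetType(A,MATSEQAIJCUSPARSE);
     ... preallocate and assemble A ...
     KSPCreate(PETSC_COMM_SELF,&ksp);
     KSPSetOperators(ksp,A,A);
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCILU);
     PCFactorSetMatSolverPackage(pc,MATSOLVERCUSPARSE);
     KSPSetFromOptions(ksp);
     KSPSolve(ksp,b,x);

   The same selection can be made at run time with
     -mat_type seqaijcusparse -pc_type ilu -pc_factor_mat_solver_package cusparse
*/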

103: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
104: {
106:   PetscInt       n = A->rmap->n;

109:   MatCreate(PetscObjectComm((PetscObject)A),B);
110:   (*B)->factortype = ftype;
111:   MatSetSizes(*B,n,n,n,n);
112:   MatSetType(*B,MATSEQAIJCUSPARSE);

114:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
115:     MatSetBlockSizesFromMats(*B,A,A);
116:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
117:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
118:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
119:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
120:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
121:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

123:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
124:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);
125:   return(0);
126: }

130: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
131: {
132:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

135: #if CUDA_VERSION>=4020
136:   switch (op) {
137:   case MAT_CUSPARSE_MULT:
138:     cusparsestruct->format = format;
139:     break;
140:   case MAT_CUSPARSE_ALL:
141:     cusparsestruct->format = format;
142:     break;
143:   default:
144:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
145:   }
146: #else
147:   if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
148:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
149: #endif
150:   return(0);
151: }

153: /*@
154:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
155:    operation. Only the MatMult operation can use different GPU storage formats
156:    for MPIAIJCUSPARSE matrices.
157:    Not Collective

159:    Input Parameters:
160: +  A - Matrix of type SEQAIJCUSPARSE
161: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
162: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)


166:    Level: intermediate

168: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
169: @*/
172: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
173: {

178:   PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
179:   return(0);
180: }
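/*
   Example (a minimal sketch, assuming A is an already created SEQAIJCUSPARSE matrix): use the
   HYB format only for matrix-vector products, or the ELL format for all GPU operations on A.
   Both ELL and HYB require CUDA 4.2 or later; whether they beat the default CSR format depends
   on the sparsity pattern and the hardware.

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_HYB);
     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL);
*/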

184: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat A)
185: {
186:   PetscErrorCode           ierr;
187:   MatCUSPARSEStorageFormat format;
188:   PetscBool                flg;
189:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

192:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
193:   PetscObjectOptionsBegin((PetscObject)A);
194:   if (A->factortype==MAT_FACTOR_NONE) {
195:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
196:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
197:     if (flg) {
198:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
199:     }
200:   }
201:   PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
202:                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
203:   if (flg) {
204:     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
205:   }
206:   PetscOptionsEnd();
207:   return(0);

209: }
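/*
   Run-time equivalent (sketch; "app" stands for any PETSc executable): the two options
   registered above can be given on the command line instead of calling MatCUSPARSESetFormat()
   directly, e.g.

     ./app -mat_type seqaijcusparse -mat_cusparse_storage_format hyb
     ./app -mat_type seqaijcusparse -mat_cusparse_mult_storage_format ell

   The first option applies to both SpMV and the triangular solves (MAT_CUSPARSE_ALL); the
   second changes only the MatMult format (MAT_CUSPARSE_MULT) and is read only for
   non-factored matrices. Valid values are csr, ell, and hyb, matching
   MatCUSPARSEStorageFormats defined near the top of this file.
*/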

213: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
214: {

218:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
219:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
220:   return(0);
221: }

225: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
226: {

230:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
231:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
232:   return(0);
233: }

237: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
238: {

242:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
243:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
244:   return(0);
245: }

249: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
250: {

254:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
255:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
256:   return(0);
257: }

261: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
262: {
263:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
264:   PetscInt                          n = A->rmap->n;
265:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
266:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
267:   cusparseStatus_t                  stat;
268:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
269:   const MatScalar                   *aa = a->a,*v;
270:   PetscInt                          *AiLo, *AjLo;
271:   PetscScalar                       *AALo;
272:   PetscInt                          i,nz, nzLower, offset, rowOffset;
273:   PetscErrorCode                    ierr;

276:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
277:     try {
278:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
279:       nzLower=n+ai[n]-ai[1];

281:       /* Allocate Space for the lower triangular matrix */
282:       cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
283:       cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
284:       cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

286:       /* Fill the lower triangular matrix */
287:       AiLo[0]  = (PetscInt) 0;
288:       AiLo[n]  = nzLower;
289:       AjLo[0]  = (PetscInt) 0;
290:       AALo[0]  = (MatScalar) 1.0;
291:       v        = aa;
292:       vi       = aj;
293:       offset   = 1;
294:       rowOffset= 1;
295:       for (i=1; i<n; i++) {
296:         nz = ai[i+1] - ai[i];
297:         /* additional 1 for the term on the diagonal */
298:         AiLo[i]    = rowOffset;
299:         rowOffset += nz+1;

301:         PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));
302:         PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));

304:         offset      += nz;
305:         AjLo[offset] = (PetscInt) i;
306:         AALo[offset] = (MatScalar) 1.0;
307:         offset      += 1;

309:         v  += nz;
310:         vi += nz;
311:       }

313:       /* allocate space for the triangular factor information */
314:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

316:       /* Create the matrix description */
317:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
318:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
319:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
320:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
321:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

323:       /* Create the solve analysis information */
324:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

326:       /* set the operation */
327:       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

329:       /* set the matrix */
330:       loTriFactor->csrMat = new CsrMatrix;
331:       loTriFactor->csrMat->num_rows = n;
332:       loTriFactor->csrMat->num_cols = n;
333:       loTriFactor->csrMat->num_entries = nzLower;

335:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
336:       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

338:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
339:       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

341:       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
342:       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

344:       /* perform the solve analysis */
345:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
346:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
347:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
348:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

350:       /* assign the pointer. Is this really necessary? */
351:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

353:       cudaFreeHost(AiLo);CHKERRCUSP(ierr);
354:       cudaFreeHost(AjLo);CHKERRCUSP(ierr);
355:       cudaFreeHost(AALo);CHKERRCUSP(ierr);
356:     } catch(char *ex) {
357:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
358:     }
359:   }
360:   return(0);
361: }

365: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
366: {
367:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
368:   PetscInt                          n = A->rmap->n;
369:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
370:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
371:   cusparseStatus_t                  stat;
372:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
373:   const MatScalar                   *aa = a->a,*v;
374:   PetscInt                          *AiUp, *AjUp;
375:   PetscScalar                       *AAUp;
376:   PetscInt                          i,nz, nzUpper, offset;
377:   PetscErrorCode                    ierr;

380:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
381:     try {
382:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
383:       nzUpper = adiag[0]-adiag[n];

385:       /* Allocate Space for the upper triangular matrix */
386:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
387:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
388:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

390:       /* Fill the upper triangular matrix */
391:       AiUp[0]=(PetscInt) 0;
392:       AiUp[n]=nzUpper;
393:       offset = nzUpper;
394:       for (i=n-1; i>=0; i--) {
395:         v  = aa + adiag[i+1] + 1;
396:         vi = aj + adiag[i+1] + 1;

398:         /* number of elements NOT on the diagonal */
399:         nz = adiag[i] - adiag[i+1]-1;

401:         /* decrement the offset */
402:         offset -= (nz+1);

404:         /* first, set the diagonal elements */
405:         AjUp[offset] = (PetscInt) i;
406:         AAUp[offset] = 1./v[nz];
407:         AiUp[i]      = AiUp[i+1] - (nz+1);

409:         PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));
410:         PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));
411:       }

413:       /* allocate space for the triangular factor information */
414:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

416:       /* Create the matrix description */
417:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
418:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
419:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
420:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
421:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

423:       /* Create the solve analysis information */
424:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

426:       /* set the operation */
427:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

429:       /* set the matrix */
430:       upTriFactor->csrMat = new CsrMatrix;
431:       upTriFactor->csrMat->num_rows = n;
432:       upTriFactor->csrMat->num_cols = n;
433:       upTriFactor->csrMat->num_entries = nzUpper;

435:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
436:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

438:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
439:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

441:       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
442:       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

444:       /* perform the solve analysis */
445:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
446:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
447:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
448:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

450:       /* assign the pointer. Is this really necessary? */
451:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

453:       cudaFreeHost(AiUp);CHKERRCUSP(ierr);
454:       cudaFreeHost(AjUp);CHKERRCUSP(ierr);
455:       cudaFreeHost(AAUp);CHKERRCUSP(ierr);
456:     } catch(char *ex) {
457:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
458:     }
459:   }
460:   return(0);
461: }

465: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
466: {
467:   PetscErrorCode               ierr;
468:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
469:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
470:   IS                           isrow = a->row,iscol = a->icol;
471:   PetscBool                    row_identity,col_identity;
472:   const PetscInt               *r,*c;
473:   PetscInt                     n = A->rmap->n;

476:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
477:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

479:   cusparseTriFactors->workVector = new THRUSTARRAY;
480:   cusparseTriFactors->workVector->resize(n);
481:   cusparseTriFactors->nnz=a->nz;

483:   A->valid_GPU_matrix = PETSC_CUSP_BOTH;
484:   /* lower triangular indices */
485:   ISGetIndices(isrow,&r);
486:   ISIdentity(isrow,&row_identity);
487:   if (!row_identity) {
488:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
489:     cusparseTriFactors->rpermIndices->assign(r, r+n);
490:   }
491:   ISRestoreIndices(isrow,&r);

493:   /* upper triangular indices */
494:   ISGetIndices(iscol,&c);
495:   ISIdentity(iscol,&col_identity);
496:   if (!col_identity) {
497:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
498:     cusparseTriFactors->cpermIndices->assign(c, c+n);
499:   }
500:   ISRestoreIndices(iscol,&c);
501:   return(0);
502: }

506: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
507: {
508:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
509:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
510:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
511:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
512:   cusparseStatus_t                  stat;
513:   PetscErrorCode                    ierr;
514:   PetscInt                          *AiUp, *AjUp;
515:   PetscScalar                       *AAUp;
516:   PetscScalar                       *AALo;
517:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
518:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
519:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
520:   const MatScalar                   *aa = b->a,*v;

523:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
524:     try {
525:       /* Allocate Space for the upper triangular matrix */
526:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
527:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
528:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
529:       cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

531:       /* Fill the upper triangular matrix */
532:       AiUp[0]=(PetscInt) 0;
533:       AiUp[n]=nzUpper;
534:       offset = 0;
535:       for (i=0; i<n; i++) {
536:         /* set the pointers */
537:         v  = aa + ai[i];
538:         vj = aj + ai[i];
539:         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

541:         /* first, set the diagonal elements */
542:         AjUp[offset] = (PetscInt) i;
543:         AAUp[offset] = 1.0/v[nz];
544:         AiUp[i]      = offset;
545:         AALo[offset] = 1.0/v[nz];

547:         offset+=1;
548:         if (nz>0) {
549:           PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));
550:           PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));
551:           for (j=offset; j<offset+nz; j++) {
552:             AAUp[j] = -AAUp[j];
553:             AALo[j] = AAUp[j]/v[nz];
554:           }
555:           offset+=nz;
556:         }
557:       }

559:       /* allocate space for the triangular factor information */
560:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

562:       /* Create the matrix description */
563:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
564:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
565:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
566:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
567:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

569:       /* Create the solve analysis information */
570:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

572:       /* set the operation */
573:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

575:       /* set the matrix */
576:       upTriFactor->csrMat = new CsrMatrix;
577:       upTriFactor->csrMat->num_rows = A->rmap->n;
578:       upTriFactor->csrMat->num_cols = A->cmap->n;
579:       upTriFactor->csrMat->num_entries = a->nz;

581:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
582:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

584:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
585:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

587:       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
588:       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

590:       /* perform the solve analysis */
591:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
592:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
593:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
594:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

596:       /* assign the pointer. Is this really necessary? */
597:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

599:       /* allocate space for the triangular factor information */
600:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

602:       /* Create the matrix description */
603:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
604:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
605:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
606:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
607:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

609:       /* Create the solve analysis information */
610:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

612:       /* set the operation: this factor is stored as the upper triangle and applied as the lower factor via a transpose solve */
613:       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

615:       /* set the matrix */
616:       loTriFactor->csrMat = new CsrMatrix;
617:       loTriFactor->csrMat->num_rows = A->rmap->n;
618:       loTriFactor->csrMat->num_cols = A->cmap->n;
619:       loTriFactor->csrMat->num_entries = a->nz;

621:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
622:       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

624:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
625:       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

627:       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
628:       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

630:       /* perform the solve analysis */
631:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
632:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
633:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
634:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

636:       /* assign the pointer. Is this really necessary? */
637:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

639:       A->valid_GPU_matrix = PETSC_CUSP_BOTH;
640:       cudaFreeHost(AiUp);CHKERRCUSP(ierr);
641:       cudaFreeHost(AjUp);CHKERRCUSP(ierr);
642:       cudaFreeHost(AAUp);CHKERRCUSP(ierr);
643:       cudaFreeHost(AALo);CHKERRCUSP(ierr);
644:     } catch(char *ex) {
645:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
646:     }
647:   }
648:   return(0);
649: }

653: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
654: {
655:   PetscErrorCode               ierr;
656:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
657:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
658:   IS                           ip = a->row;
659:   const PetscInt               *rip;
660:   PetscBool                    perm_identity;
661:   PetscInt                     n = A->rmap->n;

664:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
665:   cusparseTriFactors->workVector = new THRUSTARRAY;
666:   cusparseTriFactors->workVector->resize(n);
667:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;

669:   /* lower triangular indices */
670:   ISGetIndices(ip,&rip);
671:   ISIdentity(ip,&perm_identity);
672:   if (!perm_identity) {
673:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
674:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
675:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
676:     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
677:   }
678:   ISRestoreIndices(ip,&rip);
679:   return(0);
680: }

684: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
685: {
686:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
687:   IS             isrow = b->row,iscol = b->col;
688:   PetscBool      row_identity,col_identity;

692:   MatLUFactorNumeric_SeqAIJ(B,A,info);
693:   /* determine which version of MatSolve needs to be used. */
694:   ISIdentity(isrow,&row_identity);
695:   ISIdentity(iscol,&col_identity);
696:   if (row_identity && col_identity) {
697:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
698:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
699:   } else {
700:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
701:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
702:   }

704:   /* get the triangular factors */
705:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
706:   return(0);
707: }

711: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
712: {
713:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
714:   IS             ip = b->row;
715:   PetscBool      perm_identity;

719:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);

721:   /* determine which version of MatSolve needs to be used. */
722:   ISIdentity(ip,&perm_identity);
723:   if (perm_identity) {
724:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
725:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
726:   } else {
727:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
728:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
729:   }

731:   /* get the triangular factors */
732:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
733:   return(0);
734: }

738: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
739: {
740:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
741:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
742:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
743:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
744:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
745:   cusparseStatus_t                  stat;
746:   cusparseIndexBase_t               indexBase;
747:   cusparseMatrixType_t              matrixType;
748:   cusparseFillMode_t                fillMode;
749:   cusparseDiagType_t                diagType;


753:   /*********************************************/
754:   /* Now the Transpose of the Lower Tri Factor */
755:   /*********************************************/

757:   /* allocate space for the transpose of the lower triangular factor */
758:   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

760:   /* set the matrix descriptors of the lower triangular factor */
761:   matrixType = cusparseGetMatType(loTriFactor->descr);
762:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
763:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
764:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
765:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

767:   /* Create the matrix description */
768:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
769:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
770:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
771:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
772:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

774:   /* Create the solve analysis information */
775:   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

777:   /* set the operation */
778:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

780:   /* allocate GPU space for the CSC of the lower triangular factor*/
781:   loTriFactorT->csrMat = new CsrMatrix;
782:   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
783:   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
784:   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
785:   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
786:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
787:   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

789:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
790:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
791:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
792:                           loTriFactor->csrMat->values->data().get(),
793:                           loTriFactor->csrMat->row_offsets->data().get(),
794:                           loTriFactor->csrMat->column_indices->data().get(),
795:                           loTriFactorT->csrMat->values->data().get(),
796:                           loTriFactorT->csrMat->column_indices->data().get(),
797:                           loTriFactorT->csrMat->row_offsets->data().get(),
798:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

800:   /* perform the solve analysis on the transposed matrix */
801:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
802:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
803:                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
804:                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
805:                            loTriFactorT->solveInfo);CHKERRCUSP(stat);

807:   /* assign the pointer. Is this really necessary? */
808:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

810:   /*********************************************/
811:   /* Now the Transpose of the Upper Tri Factor */
812:   /*********************************************/

814:   /* allocate space for the transpose of the upper triangular factor */
815:   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

817:   /* set the matrix descriptors of the upper triangular factor */
818:   matrixType = cusparseGetMatType(upTriFactor->descr);
819:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
820:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
821:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
822:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

824:   /* Create the matrix description */
825:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
826:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
827:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
828:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
829:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

831:   /* Create the solve analysis information */
832:   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

834:   /* set the operation */
835:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

837:   /* allocate GPU space for the CSC of the upper triangular factor*/
838:   upTriFactorT->csrMat = new CsrMatrix;
839:   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
840:   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
841:   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
842:   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
843:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
844:   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

846:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
847:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
848:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
849:                           upTriFactor->csrMat->values->data().get(),
850:                           upTriFactor->csrMat->row_offsets->data().get(),
851:                           upTriFactor->csrMat->column_indices->data().get(),
852:                           upTriFactorT->csrMat->values->data().get(),
853:                           upTriFactorT->csrMat->column_indices->data().get(),
854:                           upTriFactorT->csrMat->row_offsets->data().get(),
855:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

857:   /* perform the solve analysis on the transposed matrix */
858:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
859:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
860:                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
861:                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
862:                            upTriFactorT->solveInfo);CHKERRCUSP(stat);

864:   /* assign the pointer. Is this really necessary? */
865:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
866:   return(0);
867: }

871: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
872: {
873:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
874:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
875:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
876:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
877:   cusparseStatus_t             stat;
878:   cusparseIndexBase_t          indexBase;
879:   cudaError_t                  err;


883:   /* allocate space for the matrix-multiply structure holding the transpose */
884:   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
885:   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
886:   indexBase = cusparseGetMatIndexBase(matstruct->descr);
887:   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
888:   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

890:   /* set alpha and beta */
891:   err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
892:   err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
893:   err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
894:   err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
895:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

897:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
898:     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
899:     CsrMatrix *matrixT= new CsrMatrix;
900:     matrixT->num_rows = A->rmap->n;
901:     matrixT->num_cols = A->cmap->n;
902:     matrixT->num_entries = a->nz;
903:     matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
904:     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
905:     matrixT->values = new THRUSTARRAY(a->nz);

907:     /* compute the transpose of the matrix, i.e. the CSC */
908:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
909:     stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
910:                             matrix->num_cols, matrix->num_entries,
911:                             matrix->values->data().get(),
912:                             matrix->row_offsets->data().get(),
913:                             matrix->column_indices->data().get(),
914:                             matrixT->values->data().get(),
915:                             matrixT->column_indices->data().get(),
916:                             matrixT->row_offsets->data().get(),
917:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

919:     /* assign the pointer */
920:     matstructT->mat = matrixT;

922:   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
923: #if CUDA_VERSION>=5000
924:     /* First convert HYB to CSR */
925:     CsrMatrix *temp= new CsrMatrix;
926:     temp->num_rows = A->rmap->n;
927:     temp->num_cols = A->cmap->n;
928:     temp->num_entries = a->nz;
929:     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
930:     temp->column_indices = new THRUSTINTARRAY32(a->nz);
931:     temp->values = new THRUSTARRAY(a->nz);


934:     stat = cusparse_hyb2csr(cusparsestruct->handle,
935:                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
936:                             temp->values->data().get(),
937:                             temp->row_offsets->data().get(),
938:                             temp->column_indices->data().get());CHKERRCUSP(stat);

940:     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
941:     CsrMatrix *tempT= new CsrMatrix;
942:     tempT->num_rows = A->rmap->n;
943:     tempT->num_cols = A->cmap->n;
944:     tempT->num_entries = a->nz;
945:     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
946:     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
947:     tempT->values = new THRUSTARRAY(a->nz);

949:     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
950:                             temp->num_cols, temp->num_entries,
951:                             temp->values->data().get(),
952:                             temp->row_offsets->data().get(),
953:                             temp->column_indices->data().get(),
954:                             tempT->values->data().get(),
955:                             tempT->column_indices->data().get(),
956:                             tempT->row_offsets->data().get(),
957:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

959:     /* Last, convert CSC to HYB */
960:     cusparseHybMat_t hybMat;
961:     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
962:     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
963:       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
964:     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
965:                             matstructT->descr, tempT->values->data().get(),
966:                             tempT->row_offsets->data().get(),
967:                             tempT->column_indices->data().get(),
968:                             hybMat, 0, partition);CHKERRCUSP(stat);

970:     /* assign the pointer */
971:     matstructT->mat = hybMat;

973:     /* delete temporaries */
974:     if (tempT) {
975:       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
976:       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
977:       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
978:       delete (CsrMatrix*) tempT;
979:     }
980:     if (temp) {
981:       if (temp->values) delete (THRUSTARRAY*) temp->values;
982:       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
983:       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
984:       delete (CsrMatrix*) temp;
985:     }
986: #else
987:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
988: #endif
989:   }
990:   /* assign the compressed row indices */
991:   matstructT->cprowIndices = new THRUSTINTARRAY;

993:   /* assign the pointer */
994:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
995:   return(0);
996: }

1000: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1001: {
1002:   CUSPARRAY                         *xGPU, *bGPU;
1003:   cusparseStatus_t                  stat;
1004:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1005:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1006:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1007:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1008:   PetscErrorCode                    ierr;

1011:   /* Analyze the matrix and create the transpose ... on the fly */
1012:   if (!loTriFactorT && !upTriFactorT) {
1013:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1014:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1015:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1016:   }

1018:   /* Get the GPU pointers */
1019:   VecCUSPGetArrayWrite(xx,&xGPU);
1020:   VecCUSPGetArrayRead(bb,&bGPU);

1022:   /* First, reorder with the row permutation */
1023:   thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
1024:                thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
1025:                xGPU->begin());

1027:   /* Next, solve U */
1028:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1029:                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1030:                         upTriFactorT->csrMat->values->data().get(),
1031:                         upTriFactorT->csrMat->row_offsets->data().get(),
1032:                         upTriFactorT->csrMat->column_indices->data().get(),
1033:                         upTriFactorT->solveInfo,
1034:                         xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

1036:   /* Then, solve L */
1037:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1038:                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1039:                         loTriFactorT->csrMat->values->data().get(),
1040:                         loTriFactorT->csrMat->row_offsets->data().get(),
1041:                         loTriFactorT->csrMat->column_indices->data().get(),
1042:                         loTriFactorT->solveInfo,
1043:                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

1045:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1046:   thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1047:                thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
1048:                tempGPU->begin());

1050:   /* Copy the temporary to the full solution. */
1051:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

1053:   /* restore */
1054:   VecCUSPRestoreArrayRead(bb,&bGPU);
1055:   VecCUSPRestoreArrayWrite(xx,&xGPU);
1056:   WaitForGPU();CHKERRCUSP(ierr);

1058:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1059:   return(0);
1060: }

1064: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1065: {
1066:   CUSPARRAY                         *xGPU,*bGPU;
1067:   cusparseStatus_t                  stat;
1068:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1069:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1070:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1071:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1072:   PetscErrorCode                    ierr;

1075:   /* Analyze the matrix and create the transpose ... on the fly */
1076:   if (!loTriFactorT && !upTriFactorT) {
1077:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1078:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1079:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1080:   }

1082:   /* Get the GPU pointers */
1083:   VecCUSPGetArrayWrite(xx,&xGPU);
1084:   VecCUSPGetArrayRead(bb,&bGPU);

1086:   /* First, solve U */
1087:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1088:                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1089:                         upTriFactorT->csrMat->values->data().get(),
1090:                         upTriFactorT->csrMat->row_offsets->data().get(),
1091:                         upTriFactorT->csrMat->column_indices->data().get(),
1092:                         upTriFactorT->solveInfo,
1093:                         bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

1095:   /* Then, solve L */
1096:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1097:                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1098:                         loTriFactorT->csrMat->values->data().get(),
1099:                         loTriFactorT->csrMat->row_offsets->data().get(),
1100:                         loTriFactorT->csrMat->column_indices->data().get(),
1101:                         loTriFactorT->solveInfo,
1102:                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

1104:   /* restore */
1105:   VecCUSPRestoreArrayRead(bb,&bGPU);
1106:   VecCUSPRestoreArrayWrite(xx,&xGPU);
1107:   WaitForGPU();CHKERRCUSP(ierr);
1108:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1109:   return(0);
1110: }

1114: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1115: {
1116:   CUSPARRAY                         *xGPU,*bGPU;
1117:   cusparseStatus_t                  stat;
1118:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1119:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1120:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1121:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1122:   PetscErrorCode                    ierr;
1123:   VecType                           t;
1124:   PetscBool                         flg;

1127:   VecGetType(bb,&t);
1128:   PetscStrcmp(t,VECSEQCUSP,&flg);
1129:   if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s.",t,VECSEQCUSP);
1130:   VecGetType(xx,&t);
1131:   PetscStrcmp(t,VECSEQCUSP,&flg);
1132:   if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s.",t,VECSEQCUSP);

1134:   /* Get the GPU pointers */
1135:   VecCUSPGetArrayWrite(xx,&xGPU);
1136:   VecCUSPGetArrayRead(bb,&bGPU);

1138:   /* First, reorder with the row permutation */
1139:   thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
1140:                thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
1141:                xGPU->begin());

1143:   /* Next, solve L */
1144:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1145:                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1146:                         loTriFactor->csrMat->values->data().get(),
1147:                         loTriFactor->csrMat->row_offsets->data().get(),
1148:                         loTriFactor->csrMat->column_indices->data().get(),
1149:                         loTriFactor->solveInfo,
1150:                         xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

1152:   /* Then, solve U */
1153:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1154:                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1155:                         upTriFactor->csrMat->values->data().get(),
1156:                         upTriFactor->csrMat->row_offsets->data().get(),
1157:                         upTriFactor->csrMat->column_indices->data().get(),
1158:                         upTriFactor->solveInfo,
1159:                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

1161:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1162:   thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1163:                thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
1164:                tempGPU->begin());

1166:   /* Copy the temporary to the full solution. */
1167:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

1169:   VecCUSPRestoreArrayRead(bb,&bGPU);
1170:   VecCUSPRestoreArrayWrite(xx,&xGPU);
1171:   WaitForGPU();CHKERRCUSP(ierr);
1172:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1173:   return(0);
1174: }

1178: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1179: {
1180:   CUSPARRAY                         *xGPU,*bGPU;
1181:   cusparseStatus_t                  stat;
1182:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1183:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1184:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1185:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1186:   PetscErrorCode                    ierr;

1189:   /* Get the GPU pointers */
1190:   VecCUSPGetArrayWrite(xx,&xGPU);
1191:   VecCUSPGetArrayRead(bb,&bGPU);

1193:   /* First, solve L */
1194:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1195:                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1196:                         loTriFactor->csrMat->values->data().get(),
1197:                         loTriFactor->csrMat->row_offsets->data().get(),
1198:                         loTriFactor->csrMat->column_indices->data().get(),
1199:                         loTriFactor->solveInfo,
1200:                         bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

1202:   /* Next, solve U */
1203:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1204:                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1205:                         upTriFactor->csrMat->values->data().get(),
1206:                         upTriFactor->csrMat->row_offsets->data().get(),
1207:                         upTriFactor->csrMat->column_indices->data().get(),
1208:                         upTriFactor->solveInfo,
1209:                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

1211:   VecCUSPRestoreArrayRead(bb,&bGPU);
1212:   VecCUSPRestoreArrayWrite(xx,&xGPU);
1213:   WaitForGPU();CHKERRCUSP(ierr);
1214:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1215:   return(0);
1216: }

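/* Copies the host AIJ matrix to the GPU when the device copy is unallocated or out of date.
   Rows with no nonzeros are squeezed out (compressed row storage is always used on the GPU);
   the data are then stored either as a CsrMatrix or, for ELL/HYB with CUDA >= 4.2, converted
   with cusparse_csr2hyb.  The compressed-row indices (cprowIndices) and a work vector sized
   to the number of nonzero rows are kept for use by the multiply-add kernels. */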
1220: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1221: {

1223:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1224:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1225:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1226:   PetscInt                     m = A->rmap->n,*ii,*ridx;
1227:   PetscErrorCode               ierr;
1228:   cusparseStatus_t             stat;
1229:   cudaError_t                  err;

1232:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
1233:     PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1234:     Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1235:     try {
1236:       cusparsestruct->nonzerorow=0;
1237:       for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

1239:       if (a->compressedrow.use) {
1240:         m    = a->compressedrow.nrows;
1241:         ii   = a->compressedrow.i;
1242:         ridx = a->compressedrow.rindex;
1243:       } else {
1244:         /* Forcing compressed row on the GPU */
1245:         int k=0;
1246:         PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1247:         PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1248:         ii[0]=0;
1249:         for (int j = 0; j<m; j++) {
1250:           if ((a->i[j+1]-a->i[j])>0) {
1251:             ii[k]  = a->i[j];
1252:             ridx[k]= j;
1253:             k++;
1254:           }
1255:         }
1256:         ii[cusparsestruct->nonzerorow] = a->nz;
1257:         m = cusparsestruct->nonzerorow;
1258:       }

1260:       /* allocate space for the matrix-multiply structure */
1261:       matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1262:       stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
1263:       stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
1264:       stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

1266:       err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
1267:       err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1268:       err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
1269:       err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1270:       stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

1272:       /* Build the matrix in the chosen storage format: plain CSR, or ELL/HYB converted from CSR below */
1273:       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1274:         /* set the matrix */
1275:         CsrMatrix *matrix= new CsrMatrix;
1276:         matrix->num_rows = m;
1277:         matrix->num_cols = A->cmap->n;
1278:         matrix->num_entries = a->nz;
1279:         matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1280:         matrix->row_offsets->assign(ii, ii + m+1);

1282:         matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1283:         matrix->column_indices->assign(a->j, a->j+a->nz);

1285:         matrix->values = new THRUSTARRAY(a->nz);
1286:         matrix->values->assign(a->a, a->a+a->nz);

1288:         /* assign the pointer */
1289:         matstruct->mat = matrix;

1291:       } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1292: #if CUDA_VERSION>=4020
1293:         CsrMatrix *matrix= new CsrMatrix;
1294:         matrix->num_rows = m;
1295:         matrix->num_cols = A->cmap->n;
1296:         matrix->num_entries = a->nz;
1297:         matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1298:         matrix->row_offsets->assign(ii, ii + m+1);

1300:         matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1301:         matrix->column_indices->assign(a->j, a->j+a->nz);

1303:         matrix->values = new THRUSTARRAY(a->nz);
1304:         matrix->values->assign(a->a, a->a+a->nz);

1306:         cusparseHybMat_t hybMat;
1307:         stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
1308:         cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1309:           CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1310:         stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1311:                                 matstruct->descr, matrix->values->data().get(),
1312:                                 matrix->row_offsets->data().get(),
1313:                                 matrix->column_indices->data().get(),
1314:                                 hybMat, 0, partition);CHKERRCUSP(stat);
1315:         /* assign the pointer */
1316:         matstruct->mat = hybMat;

1318:         if (matrix) {
1319:           if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1320:           if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1321:           if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1322:           delete (CsrMatrix*)matrix;
1323:         }
1324: #endif
1325:       }

1327:       /* assign the compressed row indices */
1328:       matstruct->cprowIndices = new THRUSTINTARRAY(m);
1329:       matstruct->cprowIndices->assign(ridx,ridx+m);

1331:       /* assign the pointer */
1332:       cusparsestruct->mat = matstruct;

1334:       if (!a->compressedrow.use) {
1335:         PetscFree(ii);
1336:         PetscFree(ridx);
1337:       }
1338:       cusparsestruct->workVector = new THRUSTARRAY;
1339:       cusparsestruct->workVector->resize(m);
1340:     } catch(char *ex) {
1341:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1342:     }
1343:     WaitForGPU();CHKERRCUSP(ierr);

1345:     A->valid_GPU_matrix = PETSC_CUSP_BOTH;

1347:     PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1348:   }
1349:   return(0);
1350: }

1354: static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
1355: {
1357:   PetscInt rbs,cbs;

1360:   MatGetBlockSizes(mat,&rbs,&cbs);
1361:   if (right) {
1362:     VecCreate(PetscObjectComm((PetscObject)mat),right);
1363:     VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
1364:     VecSetBlockSize(*right,cbs);
1365:     VecSetType(*right,VECSEQCUSP);
1366:     PetscLayoutReference(mat->cmap,&(*right)->map);
1367:   }
1368:   if (left) {
1369:     VecCreate(PetscObjectComm((PetscObject)mat),left);
1370:     VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
1371:     VecSetBlockSize(*left,rbs);
1372:     VecSetType(*left,VECSEQCUSP);
1373:     PetscLayoutReference(mat->rmap,&(*left)->map);
1374:   }
1375:   return(0);
1376: }

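/* Thrust functor used by the multiply-add routines below: for each tuple (w,z), built by
   zipping the work vector with a permutation view of the output vector, it performs z += w.
   This scatters the compressed-row SpMV result back into the full-length output vector. */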
1378: struct VecCUSPPlusEquals
1379: {
1380:   template <typename Tuple>
1381:   __host__ __device__
1382:   void operator()(Tuple t)
1383:   {
1384:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1385:   }
1386: };

1390: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1391: {
1392:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1393:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1394:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1395:   CUSPARRAY                    *xarray,*yarray;
1396:   PetscErrorCode               ierr;
1397:   cusparseStatus_t             stat;

1400:   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1401:      MatSeqAIJCUSPARSECopyToGPU(A); */
1402:   VecCUSPGetArrayRead(xx,&xarray);
1403:   VecCUSPGetArrayWrite(yy,&yarray);
1404:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1405:     CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1406:     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1407:                              mat->num_rows, mat->num_cols, mat->num_entries,
1408:                              matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
1409:                              mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
1410:                              yarray->data().get());CHKERRCUSP(stat);
1411:   } else {
1412: #if CUDA_VERSION>=4020
1413:     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1414:     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1415:                              matstruct->alpha, matstruct->descr, hybMat,
1416:                              xarray->data().get(), matstruct->beta,
1417:                              yarray->data().get());CHKERRCUSP(stat);
1418: #endif
1419:   }
1420:   VecCUSPRestoreArrayRead(xx,&xarray);
1421:   VecCUSPRestoreArrayWrite(yy,&yarray);
1422:   if (!cusparsestruct->stream) {
1423:     WaitForGPU();CHKERRCUSP(ierr);
1424:   }
1425:   PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1426:   return(0);
1427: }

1431: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1432: {
1433:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1434:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1435:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1436:   CUSPARRAY                    *xarray,*yarray;
1437:   PetscErrorCode               ierr;
1438:   cusparseStatus_t             stat;

1441:   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1442:      MatSeqAIJCUSPARSECopyToGPU(A); */
1443:   if (!matstructT) {
1444:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1445:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1446:   }
1447:   VecCUSPGetArrayRead(xx,&xarray);
1448:   VecCUSPGetArrayWrite(yy,&yarray);

1450:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1451:     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1452:     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1453:                              mat->num_rows, mat->num_cols,
1454:                              mat->num_entries, matstructT->alpha, matstructT->descr,
1455:                              mat->values->data().get(), mat->row_offsets->data().get(),
1456:                              mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
1457:                              yarray->data().get());CHKERRCUSP(stat);
1458:   } else {
1459: #if CUDA_VERSION>=4020
1460:     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1461:     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1462:                              matstructT->alpha, matstructT->descr, hybMat,
1463:                              xarray->data().get(), matstructT->beta,
1464:                              yarray->data().get());CHKERRCUSP(stat);
1465: #endif
1466:   }
1467:   VecCUSPRestoreArrayRead(xx,&xarray);
1468:   VecCUSPRestoreArrayWrite(yy,&yarray);
1469:   if (!cusparsestruct->stream) {
1470:     WaitForGPU();CHKERRCUSP(ierr);
1471:   }
1472:   PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1473:   return(0);
1474: }


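/* Computes zz = yy + A*xx.  The SpMV result is written into the work vector, whose length is
   the number of nonzero rows, and is then added into zz through the compressed-row index list
   cprowIndices using the VecCUSPPlusEquals functor defined above. */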
1479: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1480: {
1481:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1482:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1483:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1484:   CUSPARRAY                    *xarray,*yarray,*zarray;
1485:   PetscErrorCode               ierr;
1486:   cusparseStatus_t             stat;

1489:   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1490:      MatSeqAIJCUSPARSECopyToGPU(A); */
1491:   try {
1492:     VecCopy_SeqCUSP(yy,zz);
1493:     VecCUSPGetArrayRead(xx,&xarray);
1494:     VecCUSPGetArrayRead(yy,&yarray);
1495:     VecCUSPGetArrayWrite(zz,&zarray);

1497:     /* multiply add */
1498:     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1499:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1500:       /* here we need to be careful to set the number of rows in the multiply to the
1501:          number of compressed rows in the matrix ... which is equivalent to the
1502:          size of the workVector */
1503:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1504:                                mat->num_rows, mat->num_cols,
1505:                                mat->num_entries, matstruct->alpha, matstruct->descr,
1506:                                mat->values->data().get(), mat->row_offsets->data().get(),
1507:                                mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
1508:                                cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1509:     } else {
1510: #if CUDA_VERSION>=4020
1511:       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1512:       if (cusparsestruct->workVector->size()) {
1513:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1514:             matstruct->alpha, matstruct->descr, hybMat,
1515:             xarray->data().get(), matstruct->beta,
1516:             cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1517:       }
1518: #endif
1519:     }

1521:     /* scatter the data from the temporary into the full vector with a += operation */
1522:     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
1523:         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1524:         VecCUSPPlusEquals());
1525:     VecCUSPRestoreArrayRead(xx,&xarray);
1526:     VecCUSPRestoreArrayRead(yy,&yarray);
1527:     VecCUSPRestoreArrayWrite(zz,&zarray);

1529:   } catch(char *ex) {
1530:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1531:   }
1532:   WaitForGPU();CHKERRCUSP(ierr);
1533:   PetscLogFlops(2.0*a->nz);
1534:   return(0);
1535: }

1539: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1540: {
1541:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1542:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1543:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1544:   CUSPARRAY                    *xarray,*yarray,*zarray;
1545:   PetscErrorCode               ierr;
1546:   cusparseStatus_t             stat;

1549:   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1550:      MatSeqAIJCUSPARSECopyToGPU(A); */
1551:   if (!matstructT) {
1552:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1553:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1554:   }

1556:   try {
1557:     VecCopy_SeqCUSP(yy,zz);
1558:     VecCUSPGetArrayRead(xx,&xarray);
1559:     VecCUSPGetArrayRead(yy,&yarray);
1560:     VecCUSPGetArrayWrite(zz,&zarray);

1562:     /* multiply add with matrix transpose */
1563:     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1564:       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1565:       /* here we need to be careful to set the number of rows in the multiply to the
1566:          number of compressed rows in the matrix ... which is equivalent to the
1567:          size of the workVector */
1568:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1569:                                mat->num_rows, mat->num_cols,
1570:                                mat->num_entries, matstructT->alpha, matstructT->descr,
1571:                                mat->values->data().get(), mat->row_offsets->data().get(),
1572:                                mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
1573:                                cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1574:     } else {
1575: #if CUDA_VERSION>=4020
1576:       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1577:       if (cusparsestruct->workVector->size()) {
1578:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1579:             matstructT->alpha, matstructT->descr, hybMat,
1580:             xarray->data().get(), matstructT->beta,
1581:             cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1582:       }
1583: #endif
1584:     }

1586:     /* scatter the data from the temporary into the full vector with a += operation */
1587:     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
1588:         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1589:         VecCUSPPlusEquals());

1591:     VecCUSPRestoreArrayRead(xx,&xarray);
1592:     VecCUSPRestoreArrayRead(yy,&yarray);
1593:     VecCUSPRestoreArrayWrite(zz,&zarray);

1595:   } catch(char *ex) {
1596:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1597:   }
1598:   WaitForGPU();CHKERRCUSP(ierr);
1599:   PetscLogFlops(2.0*a->nz);
1600:   return(0);
1601: }

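/* Assembly hook for the CUSPARSE type: runs the standard SeqAIJ assembly, copies the data to
   the GPU for non-factored matrices, and, on final (non-flush) assembly, installs the CUSPARSE
   multiply routines as the matrix operations. */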
1605: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1606: {

1610:   MatAssemblyEnd_SeqAIJ(A,mode);
1611:   if (A->factortype==MAT_FACTOR_NONE) {
1612:     MatSeqAIJCUSPARSECopyToGPU(A);
1613:   }
1614:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1615:   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1616:   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1617:   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1618:   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1619:   return(0);
1620: }

1622: /* --------------------------------------------------------------------------------*/
1625: /*@
1626:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1627:    (the default parallel PETSc format). This matrix will ultimately be pushed down
1628:    to NVIDIA GPUs and will use the CUSPARSE library for calculations. For good matrix
1629:    assembly performance the user should preallocate the matrix storage by setting
1630:    the parameter nz (or the array nnz).  By setting these parameters accurately,
1631:    performance during matrix assembly can be increased by more than a factor of 50.

1633:    Collective on MPI_Comm

1635:    Input Parameters:
1636: +  comm - MPI communicator, set to PETSC_COMM_SELF
1637: .  m - number of rows
1638: .  n - number of columns
1639: .  nz - number of nonzeros per row (same for all rows)
1640: -  nnz - array containing the number of nonzeros in the various rows
1641:          (possibly different for each row) or NULL

1643:    Output Parameter:
1644: .  A - the matrix

1646:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1647:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1648:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

1650:    Notes:
1651:    If nnz is given then nz is ignored

1653:    The AIJ format (also called the Yale sparse matrix format or
1654:    compressed row storage) is fully compatible with standard Fortran 77
1655:    storage.  That is, the stored row and column indices can begin at
1656:    either one (as in Fortran) or zero.  See the users' manual for details.

1658:    Specify the preallocated storage with either nz or nnz (not both).
1659:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1660:    allocation.  For large problems you MUST preallocate memory or you
1661:    will get TERRIBLE performance, see the users' manual chapter on matrices.
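
   Example usage (a minimal sketch: the sizes and per-row counts are illustrative only and
   error checking with CHKERRQ() is omitted):
.vb
      Mat      A;
      PetscInt nnz[4] = {2,3,3,2};   /* expected nonzeros in each of the 4 rows */
      MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,4,4,0,nnz,&A);
      /* ... MatSetValues(), MatAssemblyBegin(), MatAssemblyEnd(), MatMult(), ... */
      MatDestroy(&A);
.ve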

1663:    By default, this format uses inodes (identical nodes) when possible, to
1664:    improve numerical efficiency of matrix-vector products and solves. We
1665:    search for consecutive rows with the same nonzero structure, thereby
1666:    reusing matrix information to achieve increased efficiency.

1668:    Level: intermediate

1670: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1671: @*/
1672: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1673: {

1677:   MatCreate(comm,A);
1678:   MatSetSizes(*A,m,n,m,n);
1679:   MatSetType(*A,MATSEQAIJCUSPARSE);
1680:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1681:   return(0);
1682: }

1686: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1687: {
1688:   PetscErrorCode   ierr;

1691:   if (A->factortype==MAT_FACTOR_NONE) {
1692:     if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
1693:       Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1694:     }
1695:   } else {
1696:     Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1697:   }
1698:   MatDestroy_SeqAIJ(A);
1699:   return(0);
1700: }

1704: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1705: {
1707:   cusparseStatus_t stat;
1708:   cusparseHandle_t handle=0;

1711:   MatCreate_SeqAIJ(B);
1712:   if (B->factortype==MAT_FACTOR_NONE) {
1713:     /* you cannot check the inode.use flag here since the matrix was just created.
1714:        now build a GPU matrix data structure */
1715:     B->spptr = new Mat_SeqAIJCUSPARSE;
1716:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1717:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1718:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1719:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1720:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1721:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = 0;
1722:     stat = cusparseCreate(&handle);CHKERRCUSP(stat);
1723:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
1724:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1725:   } else {
1726:     /* NEXT, set the pointers to the triangular factors */
1727:     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1728:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
1729:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1730:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1731:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1732:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1733:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1734:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1735:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = 0;
1736:     stat = cusparseCreate(&handle);CHKERRCUSP(stat);
1737:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1738:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
1739:   }

1741:   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1742:   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1743:   B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
1744:   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1745:   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1746:   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1747:   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1748:   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

1750:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);

1752:   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

1754:   PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1755:   return(0);
1756: }

1758: /*M
1759:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

1761:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
1762:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1763:    All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

1765:    Options Database Keys:
1766: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1767: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1768: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

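   Example (a sketch only, with error checking omitted; the storage format can equally be
   selected at run time with the options database keys above):
.vb
      Mat      A;
      PetscInt m = 100, n = 100;                        /* illustrative sizes */
      MatCreate(PETSC_COMM_SELF,&A);
      MatSetSizes(A,m,n,m,n);
      MatSetType(A,MATSEQAIJCUSPARSE);
      MatSeqAIJSetPreallocation(A,5,NULL);              /* preallocate about 5 nonzeros per row */
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_HYB);  /* HYB format for MatMult */
.ve
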
1770:   Level: beginner

1772: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1773: M*/

1775: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


1780: PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void)
1781: {

1785:   MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
1786:   MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
1787:   MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
1788:   MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
1789:   return(0);
1790: }


1795: static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1796: {
1797:   cusparseStatus_t stat;
1798:   cusparseHandle_t handle;

1801:   if (*cusparsestruct) {
1802:     Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1803:     Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1804:     delete (*cusparsestruct)->workVector;
1805:     if ((handle = (*cusparsestruct)->handle)) {
1806:       stat = cusparseDestroy(handle);CHKERRCUSP(stat);
1807:     }
1808:     delete *cusparsestruct;
1809:     *cusparsestruct = 0;
1810:   }
1811:   return(0);
1812: }

1816: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1817: {
1819:   if (*mat) {
1820:     delete (*mat)->values;
1821:     delete (*mat)->column_indices;
1822:     delete (*mat)->row_offsets;
1823:     delete *mat;
1824:     *mat = 0;
1825:   }
1826:   return(0);
1827: }

1831: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1832: {
1833:   cusparseStatus_t stat;
1834:   PetscErrorCode   ierr;

1837:   if (*trifactor) {
1838:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSP(stat); }
1839:     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSP(stat); }
1840:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
1841:     delete *trifactor;
1842:     *trifactor = 0;
1843:   }
1844:   return(0);
1845: }

1849: static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1850: {
1851:   CsrMatrix        *mat;
1852:   cusparseStatus_t stat;
1853:   cudaError_t      err;

1856:   if (*matstruct) {
1857:     if ((*matstruct)->mat) {
1858:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1859:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1860:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
1861:       } else {
1862:         mat = (CsrMatrix*)(*matstruct)->mat;
1863:         CsrMatrix_Destroy(&mat);
1864:       }
1865:     }
1866:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSP(stat); }
1867:     delete (*matstruct)->cprowIndices;
1868:     if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUSP(err); }
1869:     if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUSP(err); }
1870:     delete *matstruct;
1871:     *matstruct = 0;
1872:   }
1873:   return(0);
1874: }

1878: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1879: {
1880:   cusparseHandle_t handle;
1881:   cusparseStatus_t stat;

1884:   if (*trifactors) {
1885:     Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1886:     Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1887:     Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1888:     Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1889:     delete (*trifactors)->rpermIndices;
1890:     delete (*trifactors)->cpermIndices;
1891:     delete (*trifactors)->workVector;
1892:     if ((handle = (*trifactors)->handle)) {
1893:       stat = cusparseDestroy(handle);CHKERRCUSP(stat);
1894:     }
1895:     delete *trifactors;
1896:     *trifactors = 0;
1897:   }
1898:   return(0);
1899: }