petsc-3.11.4 2019-09-28
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8:  #include <../src/mat/impls/aij/seq/aij.h>
  9:  #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10:  #include <../src/vec/vec/impls/dvecimpl.h>
 11:  #include <petsc/private/vecimpl.h>
 12: #undef VecType
 13:  #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

 15: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

 17: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 18: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 19: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 21: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 22: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 23: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 25: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 26: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 27: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 28: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 29: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 30: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 31: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 32: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 33: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

 35: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 36: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 37: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 38: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 39: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 41: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 42: {
 43:   cusparseStatus_t   stat;
 44:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 47:   cusparsestruct->stream = stream;
 48:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
 49:   return(0);
 50: }

 52: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
 53: {
 54:   cusparseStatus_t   stat;
 55:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 58:   if (cusparsestruct->handle != handle) {
 59:     if (cusparsestruct->handle) {
 60:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
 61:     }
 62:     cusparsestruct->handle = handle;
 63:   }
 64:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
 65:   return(0);
 66: }

 68: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
 69: {
 70:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
 72:   if (cusparsestruct->handle)
 73:     cusparsestruct->handle = 0;
 74:   return(0);
 75: }

 77: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
 78: {
 80:   *type = MATSOLVERCUSPARSE;
 81:   return(0);
 82: }

 84: /*MC
 85:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
 86:   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
 87:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
 88:   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
 89:   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
 90:   algorithms are not recommended. This class does NOT support direct solver operations.

 92:   Level: beginner

 94: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
 95: M*/

 97: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
 98: {
100:   PetscInt       n = A->rmap->n;

103:   MatCreate(PetscObjectComm((PetscObject)A),B);
104:   (*B)->factortype = ftype;
105:   MatSetSizes(*B,n,n,n,n);
106:   MatSetType(*B,MATSEQAIJCUSPARSE);

108:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
109:     MatSetBlockSizesFromMats(*B,A,A);
110:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
111:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
115:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

117:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
118:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
119:   return(0);
120: }

122: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
123: {
124:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

127: #if CUDA_VERSION>=4020
128:   switch (op) {
129:   case MAT_CUSPARSE_MULT:
130:     cusparsestruct->format = format;
131:     break;
132:   case MAT_CUSPARSE_ALL:
133:     cusparsestruct->format = format;
134:     break;
135:   default:
136:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
137:   }
138: #else
139:   if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
140: #endif
141:   return(0);
142: }

144: /*@
145:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146:    operation. Only the MatMult operation can use different GPU storage formats
147:    for MPIAIJCUSPARSE matrices.
148:    Not Collective

150:    Input Parameters:
151: +  A - Matrix of type SEQAIJCUSPARSE
152: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

155:    Output Parameter:

157:    Level: intermediate

159: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160: @*/
161: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
162: {

167:   PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
168:   return(0);
169: }

171: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
172: {
173:   PetscErrorCode           ierr;
174:   MatCUSPARSEStorageFormat format;
175:   PetscBool                flg;
176:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

179:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
180:   if (A->factortype==MAT_FACTOR_NONE) {
181:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
182:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
183:     if (flg) {
184:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
185:     }
186:   }
187:   PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
188:                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
189:   if (flg) {
190:     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
191:   }
192:   PetscOptionsTail();
193:   return(0);

195: }

197: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
198: {

202:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
203:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
204:   return(0);
205: }

207: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
208: {

212:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
213:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
214:   return(0);
215: }

217: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
218: {

222:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
223:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
224:   return(0);
225: }

227: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
228: {

232:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
233:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
234:   return(0);
235: }

237: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
238: {
239:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
240:   PetscInt                          n = A->rmap->n;
241:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
242:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
243:   cusparseStatus_t                  stat;
244:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
245:   const MatScalar                   *aa = a->a,*v;
246:   PetscInt                          *AiLo, *AjLo;
247:   PetscScalar                       *AALo;
248:   PetscInt                          i,nz, nzLower, offset, rowOffset;
249:   PetscErrorCode                    ierr;

252:   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
253:     try {
254:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
255:       nzLower=n+ai[n]-ai[1];

257:       /* Allocate Space for the lower triangular matrix */
258:       cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
259:       cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
260:       cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

262:       /* Fill the lower triangular matrix */
263:       AiLo[0]  = (PetscInt) 0;
264:       AiLo[n]  = nzLower;
265:       AjLo[0]  = (PetscInt) 0;
266:       AALo[0]  = (MatScalar) 1.0;
267:       v        = aa;
268:       vi       = aj;
269:       offset   = 1;
270:       rowOffset= 1;
271:       for (i=1; i<n; i++) {
272:         nz = ai[i+1] - ai[i];
273:         /* additional 1 for the term on the diagonal */
274:         AiLo[i]    = rowOffset;
275:         rowOffset += nz+1;

277:         PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));
278:         PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));

280:         offset      += nz;
281:         AjLo[offset] = (PetscInt) i;
282:         AALo[offset] = (MatScalar) 1.0;
283:         offset      += 1;

285:         v  += nz;
286:         vi += nz;
287:       }

289:       /* allocate space for the triangular factor information */
290:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

292:       /* Create the matrix description */
293:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
294:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
295:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
296:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
297:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

299:       /* Create the solve analysis information */
300:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

302:       /* set the operation */
303:       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

305:       /* set the matrix */
306:       loTriFactor->csrMat = new CsrMatrix;
307:       loTriFactor->csrMat->num_rows = n;
308:       loTriFactor->csrMat->num_cols = n;
309:       loTriFactor->csrMat->num_entries = nzLower;

311:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
312:       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

314:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
315:       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

317:       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
318:       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

320:       /* perform the solve analysis */
321:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
322:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
323:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
324:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

326:       /* assign the pointer. Is this really necessary? */
327:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

329:       cudaFreeHost(AiLo);CHKERRCUDA(ierr);
330:       cudaFreeHost(AjLo);CHKERRCUDA(ierr);
331:       cudaFreeHost(AALo);CHKERRCUDA(ierr);
332:     } catch(char *ex) {
333:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
334:     }
335:   }
336:   return(0);
337: }

339: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
340: {
341:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
342:   PetscInt                          n = A->rmap->n;
343:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
344:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
345:   cusparseStatus_t                  stat;
346:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
347:   const MatScalar                   *aa = a->a,*v;
348:   PetscInt                          *AiUp, *AjUp;
349:   PetscScalar                       *AAUp;
350:   PetscInt                          i,nz, nzUpper, offset;
351:   PetscErrorCode                    ierr;

354:   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
355:     try {
356:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
357:       nzUpper = adiag[0]-adiag[n];

359:       /* Allocate Space for the upper triangular matrix */
360:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
361:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
362:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

364:       /* Fill the upper triangular matrix */
365:       AiUp[0]=(PetscInt) 0;
366:       AiUp[n]=nzUpper;
367:       offset = nzUpper;
368:       for (i=n-1; i>=0; i--) {
369:         v  = aa + adiag[i+1] + 1;
370:         vi = aj + adiag[i+1] + 1;

372:         /* number of elements NOT on the diagonal */
373:         nz = adiag[i] - adiag[i+1]-1;

375:         /* decrement the offset */
376:         offset -= (nz+1);

378:         /* first, set the diagonal elements */
379:         AjUp[offset] = (PetscInt) i;
380:         AAUp[offset] = (MatScalar)1./v[nz];
381:         AiUp[i]      = AiUp[i+1] - (nz+1);

383:         PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));
384:         PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));
385:       }

387:       /* allocate space for the triangular factor information */
388:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

390:       /* Create the matrix description */
391:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
392:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
393:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
394:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
395:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

397:       /* Create the solve analysis information */
398:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

400:       /* set the operation */
401:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

403:       /* set the matrix */
404:       upTriFactor->csrMat = new CsrMatrix;
405:       upTriFactor->csrMat->num_rows = n;
406:       upTriFactor->csrMat->num_cols = n;
407:       upTriFactor->csrMat->num_entries = nzUpper;

409:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
410:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

412:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
413:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

415:       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
416:       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

418:       /* perform the solve analysis */
419:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
420:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
421:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
422:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

424:       /* assign the pointer. Is this really necessary? */
425:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

427:       cudaFreeHost(AiUp);CHKERRCUDA(ierr);
428:       cudaFreeHost(AjUp);CHKERRCUDA(ierr);
429:       cudaFreeHost(AAUp);CHKERRCUDA(ierr);
430:     } catch(char *ex) {
431:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
432:     }
433:   }
434:   return(0);
435: }

437: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
438: {
439:   PetscErrorCode               ierr;
440:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
441:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
442:   IS                           isrow = a->row,iscol = a->icol;
443:   PetscBool                    row_identity,col_identity;
444:   const PetscInt               *r,*c;
445:   PetscInt                     n = A->rmap->n;

448:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
449:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

451:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
452:   cusparseTriFactors->nnz=a->nz;

454:   A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
455:   /*lower triangular indices */
456:   ISGetIndices(isrow,&r);
457:   ISIdentity(isrow,&row_identity);
458:   if (!row_identity) {
459:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
460:     cusparseTriFactors->rpermIndices->assign(r, r+n);
461:   }
462:   ISRestoreIndices(isrow,&r);

464:   /*upper triangular indices */
465:   ISGetIndices(iscol,&c);
466:   ISIdentity(iscol,&col_identity);
467:   if (!col_identity) {
468:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
469:     cusparseTriFactors->cpermIndices->assign(c, c+n);
470:   }
471:   ISRestoreIndices(iscol,&c);
472:   return(0);
473: }

475: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
476: {
477:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
478:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
479:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
480:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
481:   cusparseStatus_t                  stat;
482:   PetscErrorCode                    ierr;
483:   PetscInt                          *AiUp, *AjUp;
484:   PetscScalar                       *AAUp;
485:   PetscScalar                       *AALo;
486:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
487:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
488:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
489:   const MatScalar                   *aa = b->a,*v;

492:   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
493:     try {
494:       /* Allocate Space for the upper triangular matrix */
495:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
496:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
497:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
498:       cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

500:       /* Fill the upper triangular matrix */
501:       AiUp[0]=(PetscInt) 0;
502:       AiUp[n]=nzUpper;
503:       offset = 0;
504:       for (i=0; i<n; i++) {
505:         /* set the pointers */
506:         v  = aa + ai[i];
507:         vj = aj + ai[i];
508:         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

510:         /* first, set the diagonal elements */
511:         AjUp[offset] = (PetscInt) i;
512:         AAUp[offset] = (MatScalar)1.0/v[nz];
513:         AiUp[i]      = offset;
514:         AALo[offset] = (MatScalar)1.0/v[nz];

516:         offset+=1;
517:         if (nz>0) {
518:           PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));
519:           PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));
520:           for (j=offset; j<offset+nz; j++) {
521:             AAUp[j] = -AAUp[j];
522:             AALo[j] = AAUp[j]/v[nz];
523:           }
524:           offset+=nz;
525:         }
526:       }

528:       /* allocate space for the triangular factor information */
529:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

531:       /* Create the matrix description */
532:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
533:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
534:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
535:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
536:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

538:       /* Create the solve analysis information */
539:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

541:       /* set the operation */
542:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

544:       /* set the matrix */
545:       upTriFactor->csrMat = new CsrMatrix;
546:       upTriFactor->csrMat->num_rows = A->rmap->n;
547:       upTriFactor->csrMat->num_cols = A->cmap->n;
548:       upTriFactor->csrMat->num_entries = a->nz;

550:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
551:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

553:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
554:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

556:       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
557:       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

559:       /* perform the solve analysis */
560:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
561:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
562:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
563:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

565:       /* assign the pointer. Is this really necessary? */
566:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

568:       /* allocate space for the triangular factor information */
569:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

571:       /* Create the matrix description */
572:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
573:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
574:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
575:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
576:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

578:       /* Create the solve analysis information */
579:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

581:       /* set the operation */
582:       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

584:       /* set the matrix */
585:       loTriFactor->csrMat = new CsrMatrix;
586:       loTriFactor->csrMat->num_rows = A->rmap->n;
587:       loTriFactor->csrMat->num_cols = A->cmap->n;
588:       loTriFactor->csrMat->num_entries = a->nz;

590:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
591:       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

593:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
594:       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

596:       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
597:       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

599:       /* perform the solve analysis */
600:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
601:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
602:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
603:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

605:       /* assign the pointer. Is this really necessary? */
606:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

608:       A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
609:       cudaFreeHost(AiUp);CHKERRCUDA(ierr);
610:       cudaFreeHost(AjUp);CHKERRCUDA(ierr);
611:       cudaFreeHost(AAUp);CHKERRCUDA(ierr);
612:       cudaFreeHost(AALo);CHKERRCUDA(ierr);
613:     } catch(char *ex) {
614:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
615:     }
616:   }
617:   return(0);
618: }

620: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
621: {
622:   PetscErrorCode               ierr;
623:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
624:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
625:   IS                           ip = a->row;
626:   const PetscInt               *rip;
627:   PetscBool                    perm_identity;
628:   PetscInt                     n = A->rmap->n;

631:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
632:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
633:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;

635:   /*lower triangular indices */
636:   ISGetIndices(ip,&rip);
637:   ISIdentity(ip,&perm_identity);
638:   if (!perm_identity) {
639:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
640:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
641:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
642:     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
643:   }
644:   ISRestoreIndices(ip,&rip);
645:   return(0);
646: }

648: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
649: {
650:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
651:   IS             isrow = b->row,iscol = b->col;
652:   PetscBool      row_identity,col_identity;

656:   MatLUFactorNumeric_SeqAIJ(B,A,info);
657:   /* determine which version of MatSolve needs to be used. */
658:   ISIdentity(isrow,&row_identity);
659:   ISIdentity(iscol,&col_identity);
660:   if (row_identity && col_identity) {
661:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
662:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
663:   } else {
664:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
665:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
666:   }

668:   /* get the triangular factors */
669:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
670:   return(0);
671: }

673: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
674: {
675:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
676:   IS             ip = b->row;
677:   PetscBool      perm_identity;

681:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);

683:   /* determine which version of MatSolve needs to be used. */
684:   ISIdentity(ip,&perm_identity);
685:   if (perm_identity) {
686:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
687:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
688:   } else {
689:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
690:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
691:   }

693:   /* get the triangular factors */
694:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
695:   return(0);
696: }

698: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
699: {
700:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
701:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
702:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
703:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
704:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
705:   cusparseStatus_t                  stat;
706:   cusparseIndexBase_t               indexBase;
707:   cusparseMatrixType_t              matrixType;
708:   cusparseFillMode_t                fillMode;
709:   cusparseDiagType_t                diagType;


713:   /*********************************************/
714:   /* Now the Transpose of the Lower Tri Factor */
715:   /*********************************************/

717:   /* allocate space for the transpose of the lower triangular factor */
718:   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

720:   /* set the matrix descriptors of the lower triangular factor */
721:   matrixType = cusparseGetMatType(loTriFactor->descr);
722:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
723:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
724:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
725:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

727:   /* Create the matrix description */
728:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
729:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
730:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
731:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
732:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

734:   /* Create the solve analysis information */
735:   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

737:   /* set the operation */
738:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

740:   /* allocate GPU space for the CSC of the lower triangular factor*/
741:   loTriFactorT->csrMat = new CsrMatrix;
742:   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
743:   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
744:   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
745:   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
746:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
747:   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

749:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
750:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
751:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
752:                           loTriFactor->csrMat->values->data().get(),
753:                           loTriFactor->csrMat->row_offsets->data().get(),
754:                           loTriFactor->csrMat->column_indices->data().get(),
755:                           loTriFactorT->csrMat->values->data().get(),
756:                           loTriFactorT->csrMat->column_indices->data().get(),
757:                           loTriFactorT->csrMat->row_offsets->data().get(),
758:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

760:   /* perform the solve analysis on the transposed matrix */
761:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
762:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
763:                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
764:                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
765:                            loTriFactorT->solveInfo);CHKERRCUDA(stat);

767:   /* assign the pointer. Is this really necessary? */
768:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

770:   /*********************************************/
771:   /* Now the Transpose of the Upper Tri Factor */
772:   /*********************************************/

774:   /* allocate space for the transpose of the upper triangular factor */
775:   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

777:   /* set the matrix descriptors of the upper triangular factor */
778:   matrixType = cusparseGetMatType(upTriFactor->descr);
779:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
780:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
781:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
782:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

784:   /* Create the matrix description */
785:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
786:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
787:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
788:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
789:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

791:   /* Create the solve analysis information */
792:   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

794:   /* set the operation */
795:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

797:   /* allocate GPU space for the CSC of the upper triangular factor*/
798:   upTriFactorT->csrMat = new CsrMatrix;
799:   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
800:   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
801:   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
802:   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
803:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
804:   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

806:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
807:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
808:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
809:                           upTriFactor->csrMat->values->data().get(),
810:                           upTriFactor->csrMat->row_offsets->data().get(),
811:                           upTriFactor->csrMat->column_indices->data().get(),
812:                           upTriFactorT->csrMat->values->data().get(),
813:                           upTriFactorT->csrMat->column_indices->data().get(),
814:                           upTriFactorT->csrMat->row_offsets->data().get(),
815:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

817:   /* perform the solve analysis on the transposed matrix */
818:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
819:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
820:                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
821:                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
822:                            upTriFactorT->solveInfo);CHKERRCUDA(stat);

824:   /* assign the pointer. Is this really necessary? */
825:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
826:   return(0);
827: }

829: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
830: {
831:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
832:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
833:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
834:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
835:   cusparseStatus_t             stat;
836:   cusparseIndexBase_t          indexBase;
837:   cudaError_t                  err;


841:   /* allocate space for the triangular factor information */
842:   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
843:   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
844:   indexBase = cusparseGetMatIndexBase(matstruct->descr);
845:   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
846:   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

848:   /* set alpha and beta */
849:   err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
850:   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
851:   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
852:   err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
853:   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
854:   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
855:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

857:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
858:     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
859:     CsrMatrix *matrixT= new CsrMatrix;
860:     matrixT->num_rows = A->cmap->n;
861:     matrixT->num_cols = A->rmap->n;
862:     matrixT->num_entries = a->nz;
863:     matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
864:     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
865:     matrixT->values = new THRUSTARRAY(a->nz);

867:     /* compute the transpose of the upper triangular factor, i.e. the CSC */
868:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
869:     stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
870:                             matrix->num_cols, matrix->num_entries,
871:                             matrix->values->data().get(),
872:                             matrix->row_offsets->data().get(),
873:                             matrix->column_indices->data().get(),
874:                             matrixT->values->data().get(),
875:                             matrixT->column_indices->data().get(),
876:                             matrixT->row_offsets->data().get(),
877:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

879:     /* assign the pointer */
880:     matstructT->mat = matrixT;

882:   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
883: #if CUDA_VERSION>=5000
884:     /* First convert HYB to CSR */
885:     CsrMatrix *temp= new CsrMatrix;
886:     temp->num_rows = A->rmap->n;
887:     temp->num_cols = A->cmap->n;
888:     temp->num_entries = a->nz;
889:     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
890:     temp->column_indices = new THRUSTINTARRAY32(a->nz);
891:     temp->values = new THRUSTARRAY(a->nz);


894:     stat = cusparse_hyb2csr(cusparsestruct->handle,
895:                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
896:                             temp->values->data().get(),
897:                             temp->row_offsets->data().get(),
898:                             temp->column_indices->data().get());CHKERRCUDA(stat);

900:     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
901:     CsrMatrix *tempT= new CsrMatrix;
902:     tempT->num_rows = A->rmap->n;
903:     tempT->num_cols = A->cmap->n;
904:     tempT->num_entries = a->nz;
905:     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
906:     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
907:     tempT->values = new THRUSTARRAY(a->nz);

909:     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
910:                             temp->num_cols, temp->num_entries,
911:                             temp->values->data().get(),
912:                             temp->row_offsets->data().get(),
913:                             temp->column_indices->data().get(),
914:                             tempT->values->data().get(),
915:                             tempT->column_indices->data().get(),
916:                             tempT->row_offsets->data().get(),
917:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

919:     /* Last, convert CSC to HYB */
920:     cusparseHybMat_t hybMat;
921:     stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
922:     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
923:       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
924:     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
925:                             matstructT->descr, tempT->values->data().get(),
926:                             tempT->row_offsets->data().get(),
927:                             tempT->column_indices->data().get(),
928:                             hybMat, 0, partition);CHKERRCUDA(stat);

930:     /* assign the pointer */
931:     matstructT->mat = hybMat;

933:     /* delete temporaries */
934:     if (tempT) {
935:       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
936:       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
937:       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
938:       delete (CsrMatrix*) tempT;
939:     }
940:     if (temp) {
941:       if (temp->values) delete (THRUSTARRAY*) temp->values;
942:       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
943:       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
944:       delete (CsrMatrix*) temp;
945:     }
946: #else
947:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
948: #endif
949:   }
950:   /* assign the compressed row indices */
951:   matstructT->cprowIndices = new THRUSTINTARRAY;
952:   matstructT->cprowIndices->resize(A->cmap->n);
953:   thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());

955:   /* assign the pointer */
956:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
957:   return(0);
958: }

960: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
961: {
962:   PetscInt                              n = xx->map->n;
963:   const PetscScalar                     *barray;
964:   PetscScalar                           *xarray;
965:   thrust::device_ptr<const PetscScalar> bGPU;
966:   thrust::device_ptr<PetscScalar>       xGPU;
967:   cusparseStatus_t                      stat;
968:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
969:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
970:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
971:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
972:   PetscErrorCode                        ierr;

975:   /* Analyze the matrix and create the transpose ... on the fly */
976:   if (!loTriFactorT && !upTriFactorT) {
977:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
978:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
979:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
980:   }

982:   /* Get the GPU pointers */
983:   VecCUDAGetArrayWrite(xx,&xarray);
984:   VecCUDAGetArrayRead(bb,&barray);
985:   xGPU = thrust::device_pointer_cast(xarray);
986:   bGPU = thrust::device_pointer_cast(barray);

988:   /* First, reorder with the row permutation */
989:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
990:                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
991:                xGPU);

993:   /* First, solve U */
994:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
995:                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
996:                         upTriFactorT->csrMat->values->data().get(),
997:                         upTriFactorT->csrMat->row_offsets->data().get(),
998:                         upTriFactorT->csrMat->column_indices->data().get(),
999:                         upTriFactorT->solveInfo,
1000:                         xarray, tempGPU->data().get());CHKERRCUDA(stat);

1002:   /* Then, solve L */
1003:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1004:                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1005:                         loTriFactorT->csrMat->values->data().get(),
1006:                         loTriFactorT->csrMat->row_offsets->data().get(),
1007:                         loTriFactorT->csrMat->column_indices->data().get(),
1008:                         loTriFactorT->solveInfo,
1009:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1011:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1012:   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1013:                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1014:                tempGPU->begin());

1016:   /* Copy the temporary to the full solution. */
1017:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

1019:   /* restore */
1020:   VecCUDARestoreArrayRead(bb,&barray);
1021:   VecCUDARestoreArrayWrite(xx,&xarray);
1022:   WaitForGPU();CHKERRCUDA(ierr);

1024:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1025:   return(0);
1026: }

1028: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1029: {
1030:   const PetscScalar                 *barray;
1031:   PetscScalar                       *xarray;
1032:   cusparseStatus_t                  stat;
1033:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1034:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1035:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1036:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1037:   PetscErrorCode                    ierr;

1040:   /* Analyze the matrix and create the transpose ... on the fly */
1041:   if (!loTriFactorT && !upTriFactorT) {
1042:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1043:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1044:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1045:   }

1047:   /* Get the GPU pointers */
1048:   VecCUDAGetArrayWrite(xx,&xarray);
1049:   VecCUDAGetArrayRead(bb,&barray);

1051:   /* First, solve U */
1052:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1053:                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1054:                         upTriFactorT->csrMat->values->data().get(),
1055:                         upTriFactorT->csrMat->row_offsets->data().get(),
1056:                         upTriFactorT->csrMat->column_indices->data().get(),
1057:                         upTriFactorT->solveInfo,
1058:                         barray, tempGPU->data().get());CHKERRCUDA(stat);

1060:   /* Then, solve L */
1061:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1062:                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1063:                         loTriFactorT->csrMat->values->data().get(),
1064:                         loTriFactorT->csrMat->row_offsets->data().get(),
1065:                         loTriFactorT->csrMat->column_indices->data().get(),
1066:                         loTriFactorT->solveInfo,
1067:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1069:   /* restore */
1070:   VecCUDARestoreArrayRead(bb,&barray);
1071:   VecCUDARestoreArrayWrite(xx,&xarray);
1072:   WaitForGPU();CHKERRCUDA(ierr);
1073:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1074:   return(0);
1075: }

1077: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1078: {
1079:   const PetscScalar                     *barray;
1080:   PetscScalar                           *xarray;
1081:   thrust::device_ptr<const PetscScalar> bGPU;
1082:   thrust::device_ptr<PetscScalar>       xGPU;
1083:   cusparseStatus_t                      stat;
1084:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1085:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1086:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1087:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1088:   PetscErrorCode                        ierr;


1092:   /* Get the GPU pointers */
1093:   VecCUDAGetArrayWrite(xx,&xarray);
1094:   VecCUDAGetArrayRead(bb,&barray);
1095:   xGPU = thrust::device_pointer_cast(xarray);
1096:   bGPU = thrust::device_pointer_cast(barray);

1098:   /* First, reorder with the row permutation */
1099:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1100:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1101:                xGPU);

1103:   /* Next, solve L */
1104:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1105:                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1106:                         loTriFactor->csrMat->values->data().get(),
1107:                         loTriFactor->csrMat->row_offsets->data().get(),
1108:                         loTriFactor->csrMat->column_indices->data().get(),
1109:                         loTriFactor->solveInfo,
1110:                         xarray, tempGPU->data().get());CHKERRCUDA(stat);

1112:   /* Then, solve U */
1113:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1114:                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1115:                         upTriFactor->csrMat->values->data().get(),
1116:                         upTriFactor->csrMat->row_offsets->data().get(),
1117:                         upTriFactor->csrMat->column_indices->data().get(),
1118:                         upTriFactor->solveInfo,
1119:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1121:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1122:   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1123:                thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
1124:                tempGPU->begin());

1126:   /* Copy the temporary to the full solution. */
1127:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

1129:   VecCUDARestoreArrayRead(bb,&barray);
1130:   VecCUDARestoreArrayWrite(xx,&xarray);
1131:   WaitForGPU();CHKERRCUDA(ierr);
1132:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1133:   return(0);
1134: }

1136: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1137: {
1138:   const PetscScalar                 *barray;
1139:   PetscScalar                       *xarray;
1140:   cusparseStatus_t                  stat;
1141:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1142:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1143:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1144:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1145:   PetscErrorCode                    ierr;

1148:   /* Get the GPU pointers */
1149:   VecCUDAGetArrayWrite(xx,&xarray);
1150:   VecCUDAGetArrayRead(bb,&barray);

1152:   /* First, solve L */
1153:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1154:                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1155:                         loTriFactor->csrMat->values->data().get(),
1156:                         loTriFactor->csrMat->row_offsets->data().get(),
1157:                         loTriFactor->csrMat->column_indices->data().get(),
1158:                         loTriFactor->solveInfo,
1159:                         barray, tempGPU->data().get());CHKERRCUDA(stat);

1161:   /* Next, solve U */
1162:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1163:                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1164:                         upTriFactor->csrMat->values->data().get(),
1165:                         upTriFactor->csrMat->row_offsets->data().get(),
1166:                         upTriFactor->csrMat->column_indices->data().get(),
1167:                         upTriFactor->solveInfo,
1168:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1170:   VecCUDARestoreArrayRead(bb,&barray);
1171:   VecCUDARestoreArrayWrite(xx,&xarray);
1172:   WaitForGPU();CHKERRCUDA(ierr);
1173:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1174:   return(0);
1175: }

1177: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1178: {

1180:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1181:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1182:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1183:   PetscInt                     m = A->rmap->n,*ii,*ridx;
1184:   PetscErrorCode               ierr;
1185:   cusparseStatus_t             stat;
1186:   cudaError_t                  err;

1189:   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1190:     PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1191:     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1192:       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1193:       /* copy values only */
1194:       matrix->values->assign(a->a, a->a+a->nz);
1195:     } else {
1196:       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1197:       try {
1198:         cusparsestruct->nonzerorow=0;
1199:         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

1201:         if (a->compressedrow.use) {
1202:           m    = a->compressedrow.nrows;
1203:           ii   = a->compressedrow.i;
1204:           ridx = a->compressedrow.rindex;
1205:         } else {
1206:           /* Forcing compressed row on the GPU */
1207:           int k=0;
1208:           PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1209:           PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1210:           ii[0]=0;
1211:           for (int j = 0; j<m; j++) {
1212:             if ((a->i[j+1]-a->i[j])>0) {
1213:               ii[k]  = a->i[j];
1214:               ridx[k]= j;
1215:               k++;
1216:             }
1217:           }
1218:           ii[cusparsestruct->nonzerorow] = a->nz;
1219:           m = cusparsestruct->nonzerorow;
1220:         }

1222:         /* allocate space for the triangular factor information */
1223:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1224:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1225:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1226:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

1228:         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1229:         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1230:         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1231:         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1232:         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1233:         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1234:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

1236:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1237:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1238:           /* set the matrix */
1239:           CsrMatrix *matrix= new CsrMatrix;
1240:           matrix->num_rows = m;
1241:           matrix->num_cols = A->cmap->n;
1242:           matrix->num_entries = a->nz;
1243:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1244:           matrix->row_offsets->assign(ii, ii + m+1);

1246:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1247:           matrix->column_indices->assign(a->j, a->j+a->nz);

1249:           matrix->values = new THRUSTARRAY(a->nz);
1250:           matrix->values->assign(a->a, a->a+a->nz);

1252:           /* assign the pointer */
1253:           matstruct->mat = matrix;

1255:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1256: #if CUDA_VERSION>=4020
1257:           CsrMatrix *matrix= new CsrMatrix;
1258:           matrix->num_rows = m;
1259:           matrix->num_cols = A->cmap->n;
1260:           matrix->num_entries = a->nz;
1261:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1262:           matrix->row_offsets->assign(ii, ii + m+1);

1264:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1265:           matrix->column_indices->assign(a->j, a->j+a->nz);

1267:           matrix->values = new THRUSTARRAY(a->nz);
1268:           matrix->values->assign(a->a, a->a+a->nz);

1270:           cusparseHybMat_t hybMat;
1271:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1272:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1273:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1274:           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1275:               matstruct->descr, matrix->values->data().get(),
1276:               matrix->row_offsets->data().get(),
1277:               matrix->column_indices->data().get(),
1278:               hybMat, 0, partition);CHKERRCUDA(stat);
1279:           /* assign the pointer */
1280:           matstruct->mat = hybMat;

1282:           if (matrix) {
1283:             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1284:             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1285:             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1286:             delete (CsrMatrix*)matrix;
1287:           }
1288: #endif
1289:         }

1291:         /* assign the compressed row indices */
1292:         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1293:         matstruct->cprowIndices->assign(ridx,ridx+m);

1295:         /* assign the pointer */
1296:         cusparsestruct->mat = matstruct;

1298:         if (!a->compressedrow.use) {
1299:           PetscFree(ii);
1300:           PetscFree(ridx);
1301:         }
1302:         cusparsestruct->workVector = new THRUSTARRAY(m);
1303:       } catch(char *ex) {
1304:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1305:       }
1306:       cusparsestruct->nonzerostate = A->nonzerostate;
1307:     }
1308:     WaitForGPU();CHKERRCUDA(ierr);
1309:     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1310:     PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1311:   }
1312:   return(0);
1313: }

1315: struct VecCUDAPlusEquals
1316: {
1317:   template <typename Tuple>
1318:   __host__ __device__
1319:   void operator()(Tuple t)
1320:   {
1321:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1322:   }
1323: };

1325: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1326: {

1330:   MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);
1331:   return(0);
1332: }

1334: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1335: {
1336:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1337:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1338:   Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1339:   const PetscScalar            *xarray;
1340:   PetscScalar                  *yarray;
1341:   PetscErrorCode               ierr;
1342:   cusparseStatus_t             stat;

1345:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1346:   MatSeqAIJCUSPARSECopyToGPU(A);
1347:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1348:   if (!matstructT) {
1349:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1350:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1351:   }
1352:   VecCUDAGetArrayRead(xx,&xarray);
1353:   VecSet(yy,0);
1354:   VecCUDAGetArrayWrite(yy,&yarray);

1356:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1357:     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1358:     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1359:                              mat->num_rows, mat->num_cols,
1360:                              mat->num_entries, matstructT->alpha, matstructT->descr,
1361:                              mat->values->data().get(), mat->row_offsets->data().get(),
1362:                              mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1363:                              yarray);CHKERRCUDA(stat);
1364:   } else {
1365: #if CUDA_VERSION>=4020
1366:     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1367:     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1368:                              matstructT->alpha, matstructT->descr, hybMat,
1369:                              xarray, matstructT->beta_zero,
1370:                              yarray);CHKERRCUDA(stat);
1371: #endif
1372:   }
1373:   VecCUDARestoreArrayRead(xx,&xarray);
1374:   VecCUDARestoreArrayWrite(yy,&yarray);
1375:   if (!cusparsestruct->stream) {
1376:     WaitForGPU();CHKERRCUDA(ierr);
1377:   }
1378:   PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1379:   return(0);
1380: }


1383: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1384: {
1385:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1386:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1387:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1388:   const PetscScalar            *xarray;
1389:   PetscScalar                  *zarray,*dptr,*beta;
1390:   PetscErrorCode               ierr;
1391:   cusparseStatus_t             stat;

1394:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1395:   MatSeqAIJCUSPARSECopyToGPU(A);
1396:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1397:   try {
1398:     VecCUDAGetArrayRead(xx,&xarray);
1399:     VecCUDAGetArrayReadWrite(zz,&zarray);
1400:     dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
1401:     beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

1403:     /* csr_spmv is multiply add */
1404:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1405:       /* here we need to be careful to set the number of rows in the multiply to the
1406:          number of compressed rows in the matrix ... which is equivalent to the
1407:          size of the workVector */
1408:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1409:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1410:                                mat->num_rows, mat->num_cols,
1411:                                mat->num_entries, matstruct->alpha, matstruct->descr,
1412:                                mat->values->data().get(), mat->row_offsets->data().get(),
1413:                                mat->column_indices->data().get(), xarray, beta,
1414:                                dptr);CHKERRCUDA(stat);
1415:     } else {
1416: #if CUDA_VERSION>=4020
1417:       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1418:       if (cusparsestruct->workVector->size()) {
1419:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1420:                                  matstruct->alpha, matstruct->descr, hybMat,
1421:                                  xarray, beta,
1422:                                  dptr);CHKERRCUDA(stat);
1423:       }
1424: #endif
1425:     }

1427:     if (yy) {
1428:       if (dptr != zarray) {
1429:         VecCopy_SeqCUDA(yy,zz);
1430:       } else if (zz != yy) {
1431:         VecAXPY_SeqCUDA(zz,1.0,yy);
1432:       }
1433:     } else if (dptr != zarray) {
1434:       VecSet(zz,0);
1435:     }
1436:     /* scatter the data from the temporary into the full vector with a += operation */
1437:     if (dptr != zarray) {
1438:       thrust::device_ptr<PetscScalar> zptr;

1440:       zptr = thrust::device_pointer_cast(zarray);
1441:       thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1442:                        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1443:                        VecCUDAPlusEquals());
1444:     }
1445:     VecCUDARestoreArrayRead(xx,&xarray);
1446:     VecCUDARestoreArrayReadWrite(zz,&zarray);
1447:   } catch(char *ex) {
1448:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1449:   }
1450:   if (!yy) { /* MatMult */
1451:     if (!cusparsestruct->stream) {
1452:       WaitForGPU();CHKERRCUDA(ierr);
1453:     }
1454:   }
1455:   PetscLogFlops(2.0*a->nz);
1456:   return(0);
1457: }

1459: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1460: {
1461:   Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
1462:   Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1463:   Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
1464:   thrust::device_ptr<PetscScalar> zptr;
1465:   const PetscScalar               *xarray;
1466:   PetscScalar                     *zarray;
1467:   PetscErrorCode                  ierr;
1468:   cusparseStatus_t                stat;

1471:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1472:   MatSeqAIJCUSPARSECopyToGPU(A);
1473:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1474:   if (!matstructT) {
1475:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1476:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1477:   }

1479:   try {
1480:     VecCopy_SeqCUDA(yy,zz);
1481:     VecCUDAGetArrayRead(xx,&xarray);
1482:     VecCUDAGetArrayReadWrite(zz,&zarray);
1483:     zptr = thrust::device_pointer_cast(zarray);

1485:     /* multiply add with matrix transpose */
1486:     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1487:       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1488:       /* here we need to be careful to set the number of rows in the multiply to the
1489:          number of compressed rows in the matrix ... which is equivalent to the
1490:          size of the workVector */
1491:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1492:                                mat->num_rows, mat->num_cols,
1493:                                mat->num_entries, matstructT->alpha, matstructT->descr,
1494:                                mat->values->data().get(), mat->row_offsets->data().get(),
1495:                                mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1496:                                cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1497:     } else {
1498: #if CUDA_VERSION>=4020
1499:       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1500:       if (cusparsestruct->workVector->size()) {
1501:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1502:             matstructT->alpha, matstructT->descr, hybMat,
1503:             xarray, matstructT->beta_zero,
1504:             cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1505:       }
1506: #endif
1507:     }

1509:     /* scatter the data from the temporary into the full vector with a += operation */
1510:     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
1511:         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
1512:         VecCUDAPlusEquals());

1514:     VecCUDARestoreArrayRead(xx,&xarray);
1515:     VecCUDARestoreArrayReadWrite(zz,&zarray);

1517:   } catch(char *ex) {
1518:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1519:   }
1520:   WaitForGPU();CHKERRCUDA(ierr);
1521:   PetscLogFlops(2.0*a->nz);
1522:   return(0);
1523: }

1525: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1526: {

1530:   MatAssemblyEnd_SeqAIJ(A,mode);
1531:   if (A->factortype==MAT_FACTOR_NONE) {
1532:     MatSeqAIJCUSPARSECopyToGPU(A);
1533:   }
1534:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1535:   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1536:   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1537:   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1538:   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1539:   return(0);
1540: }

1542: /* --------------------------------------------------------------------------------*/
1543: /*@
1544:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1545:    (the default parallel PETSc format). This matrix will ultimately pushed down
1546:    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1547:    assembly performance the user should preallocate the matrix storage by setting
1548:    the parameter nz (or the array nnz).  By setting these parameters accurately,
1549:    performance during matrix assembly can be increased by more than a factor of 50.

1551:    Collective on MPI_Comm

1553:    Input Parameters:
1554: +  comm - MPI communicator, set to PETSC_COMM_SELF
1555: .  m - number of rows
1556: .  n - number of columns
1557: .  nz - number of nonzeros per row (same for all rows)
1558: -  nnz - array containing the number of nonzeros in the various rows
1559:          (possibly different for each row) or NULL

1561:    Output Parameter:
1562: .  A - the matrix

1564:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1565:    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1566:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

1568:    Notes:
1569:    If nnz is given then nz is ignored

1571:    The AIJ format (also called the Yale sparse matrix format or
1572:    compressed row storage), is fully compatible with standard Fortran 77
1573:    storage.  That is, the stored row and column indices can begin at
1574:    either one (as in Fortran) or zero.  See the users' manual for details.

1576:    Specify the preallocated storage with either nz or nnz (not both).
1577:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1578:    allocation.  For large problems you MUST preallocate memory or you
1579:    will get TERRIBLE performance, see the users' manual chapter on matrices.

1581:    By default, this format uses inodes (identical nodes) when possible, to
1582:    improve numerical efficiency of matrix-vector products and solves. We
1583:    search for consecutive rows with the same nonzero structure, thereby
1584:    reusing matrix information to achieve increased efficiency.

1586:    Level: intermediate

1588: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1589: @*/
1590: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1591: {

1595:   MatCreate(comm,A);
1596:   MatSetSizes(*A,m,n,m,n);
1597:   MatSetType(*A,MATSEQAIJCUSPARSE);
1598:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1599:   return(0);
1600: }

1602: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1603: {
1604:   PetscErrorCode   ierr;

1607:   if (A->factortype==MAT_FACTOR_NONE) {
1608:     if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
1609:       MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1610:     }
1611:   } else {
1612:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1613:   }
1614:   MatDestroy_SeqAIJ(A);
1615:   return(0);
1616: }

1618: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1619: {
1621:   Mat C;
1622:   cusparseStatus_t stat;
1623:   cusparseHandle_t handle=0;

1626:   MatDuplicate_SeqAIJ(A,cpvalues,B);
1627:   C    = *B;
1628:   PetscFree(C->defaultvectype);
1629:   PetscStrallocpy(VECCUDA,&C->defaultvectype);

1631:   /* inject CUSPARSE-specific stuff */
1632:   if (C->factortype==MAT_FACTOR_NONE) {
1633:     /* you cannot check the inode.use flag here since the matrix was just created.
1634:        now build a GPU matrix data structure */
1635:     C->spptr = new Mat_SeqAIJCUSPARSE;
1636:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
1637:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
1638:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
1639:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
1640:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
1641:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = 0;
1642:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1643:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
1644:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
1645:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
1646:   } else {
1647:     /* NEXT, set the pointers to the triangular factors */
1648:     C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1649:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
1650:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
1651:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1652:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1653:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
1654:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
1655:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
1656:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
1657:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1658:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
1659:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
1660:   }

1662:   C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1663:   C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1664:   C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1665:   C->ops->mult             = MatMult_SeqAIJCUSPARSE;
1666:   C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1667:   C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1668:   C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1669:   C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

1671:   PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);

1673:   C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

1675:   PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1676:   return(0);
1677: }

1679: PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
1680: {
1682:   cusparseStatus_t stat;
1683:   cusparseHandle_t handle=0;

1686:   PetscFree(B->defaultvectype);
1687:   PetscStrallocpy(VECCUDA,&B->defaultvectype);

1689:   if (B->factortype==MAT_FACTOR_NONE) {
1690:     /* you cannot check the inode.use flag here since the matrix was just created.
1691:        now build a GPU matrix data structure */
1692:     B->spptr = new Mat_SeqAIJCUSPARSE;
1693:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1694:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1695:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1696:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1697:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1698:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = 0;
1699:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1700:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
1701:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1702:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
1703:   } else {
1704:     /* NEXT, set the pointers to the triangular factors */
1705:     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1706:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
1707:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1708:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1709:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1710:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1711:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1712:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1713:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = 0;
1714:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1715:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1716:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
1717:   }

1719:   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1720:   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1721:   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1722:   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1723:   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1724:   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1725:   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1726:   B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

1728:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);

1730:   B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

1732:   PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1733:   return(0);
1734: }

1736: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1737: {

1741:   MatCreate_SeqAIJ(B);
1742:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);
1743:   return(0);
1744: }

1746: /*MC
1747:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

1749:    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1750:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1751:    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

1753:    Options Database Keys:
1754: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1755: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1756: .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

1758:   Level: beginner

1760: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1761: M*/

1763: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


1766: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1767: {

1771:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
1772:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
1773:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
1774:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
1775:   return(0);
1776: }


1779: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1780: {
1781:   cusparseStatus_t stat;
1782:   cusparseHandle_t handle;

1785:   if (*cusparsestruct) {
1786:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1787:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1788:     delete (*cusparsestruct)->workVector;
1789:     if (handle = (*cusparsestruct)->handle) {
1790:       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1791:     }
1792:     delete *cusparsestruct;
1793:     *cusparsestruct = 0;
1794:   }
1795:   return(0);
1796: }

1798: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1799: {
1801:   if (*mat) {
1802:     delete (*mat)->values;
1803:     delete (*mat)->column_indices;
1804:     delete (*mat)->row_offsets;
1805:     delete *mat;
1806:     *mat = 0;
1807:   }
1808:   return(0);
1809: }

1811: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1812: {
1813:   cusparseStatus_t stat;
1814:   PetscErrorCode   ierr;

1817:   if (*trifactor) {
1818:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1819:     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1820:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
1821:     delete *trifactor;
1822:     *trifactor = 0;
1823:   }
1824:   return(0);
1825: }

1827: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1828: {
1829:   CsrMatrix        *mat;
1830:   cusparseStatus_t stat;
1831:   cudaError_t      err;

1834:   if (*matstruct) {
1835:     if ((*matstruct)->mat) {
1836:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1837:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1838:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1839:       } else {
1840:         mat = (CsrMatrix*)(*matstruct)->mat;
1841:         CsrMatrix_Destroy(&mat);
1842:       }
1843:     }
1844:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1845:     delete (*matstruct)->cprowIndices;
1846:     if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1847:     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
1848:     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
1849:     delete *matstruct;
1850:     *matstruct = 0;
1851:   }
1852:   return(0);
1853: }

1855: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1856: {
1857:   cusparseHandle_t handle;
1858:   cusparseStatus_t stat;

1861:   if (*trifactors) {
1862:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1863:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1864:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1865:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1866:     delete (*trifactors)->rpermIndices;
1867:     delete (*trifactors)->cpermIndices;
1868:     delete (*trifactors)->workVector;
1869:     if (handle = (*trifactors)->handle) {
1870:       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1871:     }
1872:     delete *trifactors;
1873:     *trifactors = 0;
1874:   }
1875:   return(0);
1876: }