Actual source code: aijcusparse.cu

petsc-3.8.4 2018-03-24
Report Typos and Errors
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8:  #include <../src/mat/impls/aij/seq/aij.h>
  9:  #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10:  #include <../src/vec/vec/impls/dvecimpl.h>
 11:  #include <petsc/private/vecimpl.h>
 12: #undef VecType
 13:  #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

 15: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

 17: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 18: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 19: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 21: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 22: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 23: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 25: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 26: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 27: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 28: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 29: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 30: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 31: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 32: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 33: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

 35: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 36: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 37: static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 38: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 39: static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 41: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 42: {
 43:   cusparseStatus_t   stat;
 44:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 47:   cusparsestruct->stream = stream;
 48:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
 49:   return(0);
 50: }

 52: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
 53: {
 54:   cusparseStatus_t   stat;
 55:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 58:   if (cusparsestruct->handle != handle) {
 59:     if (cusparsestruct->handle) {
 60:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
 61:     }
 62:     cusparsestruct->handle = handle;
 63:   }
 64:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
 65:   return(0);
 66: }

 68: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
 69: {
 70:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
 72:   if (cusparsestruct->handle)
 73:     cusparsestruct->handle = 0;
 74:   return(0);
 75: }

 77: PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
 78: {
 80:   *type = MATSOLVERCUSPARSE;
 81:   return(0);
 82: }

 84: /*MC
 85:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
 86:   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
 87:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
 88:   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
 89:   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
 90:   algorithms are not recommended. This class does NOT support direct solver operations.

 92:   Level: beginner

 94: .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
 95: M*/

 97: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
 98: {
100:   PetscInt       n = A->rmap->n;

103:   MatCreate(PetscObjectComm((PetscObject)A),B);
104:   (*B)->factortype = ftype;
105:   MatSetSizes(*B,n,n,n,n);
106:   MatSetType(*B,MATSEQAIJCUSPARSE);

108:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
109:     MatSetBlockSizesFromMats(*B,A,A);
110:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
111:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
115:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

117:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
118:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);
119:   return(0);
120: }

122: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
123: {
124:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

127: #if CUDA_VERSION>=4020
128:   switch (op) {
129:   case MAT_CUSPARSE_MULT:
130:     cusparsestruct->format = format;
131:     break;
132:   case MAT_CUSPARSE_ALL:
133:     cusparsestruct->format = format;
134:     break;
135:   default:
136:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
137:   }
138: #else
139:   if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
140: #endif
141:   return(0);
142: }

144: /*@
145:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146:    operation. Only the MatMult operation can use different GPU storage formats
147:    for MPIAIJCUSPARSE matrices.
148:    Not Collective

150:    Input Parameters:
151: +  A - Matrix of type SEQAIJCUSPARSE
152: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

155:    Output Parameter:

157:    Level: intermediate

159: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160: @*/
161: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
162: {

167:   PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
168:   return(0);
169: }

171: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
172: {
173:   PetscErrorCode           ierr;
174:   MatCUSPARSEStorageFormat format;
175:   PetscBool                flg;
176:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

179:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
180:   PetscObjectOptionsBegin((PetscObject)A);
181:   if (A->factortype==MAT_FACTOR_NONE) {
182:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
183:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
184:     if (flg) {
185:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
186:     }
187:   }
188:   PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
189:                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
190:   if (flg) {
191:     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
192:   }
193:   PetscOptionsEnd();
194:   return(0);

196: }

198: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
199: {

203:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
204:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
205:   return(0);
206: }

208: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
209: {

213:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
214:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
215:   return(0);
216: }

218: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
219: {

223:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
224:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
225:   return(0);
226: }

228: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
229: {

233:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
234:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
235:   return(0);
236: }

238: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
239: {
240:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
241:   PetscInt                          n = A->rmap->n;
242:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
243:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
244:   cusparseStatus_t                  stat;
245:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
246:   const MatScalar                   *aa = a->a,*v;
247:   PetscInt                          *AiLo, *AjLo;
248:   PetscScalar                       *AALo;
249:   PetscInt                          i,nz, nzLower, offset, rowOffset;
250:   PetscErrorCode                    ierr;

253:   if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
254:     try {
255:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
256:       nzLower=n+ai[n]-ai[1];

258:       /* Allocate Space for the lower triangular matrix */
259:       cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
260:       cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
261:       cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

263:       /* Fill the lower triangular matrix */
264:       AiLo[0]  = (PetscInt) 0;
265:       AiLo[n]  = nzLower;
266:       AjLo[0]  = (PetscInt) 0;
267:       AALo[0]  = (MatScalar) 1.0;
268:       v        = aa;
269:       vi       = aj;
270:       offset   = 1;
271:       rowOffset= 1;
272:       for (i=1; i<n; i++) {
273:         nz = ai[i+1] - ai[i];
274:         /* additional 1 for the term on the diagonal */
275:         AiLo[i]    = rowOffset;
276:         rowOffset += nz+1;

278:         PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));
279:         PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));

281:         offset      += nz;
282:         AjLo[offset] = (PetscInt) i;
283:         AALo[offset] = (MatScalar) 1.0;
284:         offset      += 1;

286:         v  += nz;
287:         vi += nz;
288:       }

290:       /* allocate space for the triangular factor information */
291:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

293:       /* Create the matrix description */
294:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
295:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
296:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
297:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
298:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

300:       /* Create the solve analysis information */
301:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

303:       /* set the operation */
304:       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

306:       /* set the matrix */
307:       loTriFactor->csrMat = new CsrMatrix;
308:       loTriFactor->csrMat->num_rows = n;
309:       loTriFactor->csrMat->num_cols = n;
310:       loTriFactor->csrMat->num_entries = nzLower;

312:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
313:       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

315:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
316:       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

318:       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
319:       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

321:       /* perform the solve analysis */
322:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
323:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
324:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
325:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

327:       /* assign the pointer. Is this really necessary? */
328:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

330:       cudaFreeHost(AiLo);CHKERRCUDA(ierr);
331:       cudaFreeHost(AjLo);CHKERRCUDA(ierr);
332:       cudaFreeHost(AALo);CHKERRCUDA(ierr);
333:     } catch(char *ex) {
334:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
335:     }
336:   }
337:   return(0);
338: }

340: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
341: {
342:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
343:   PetscInt                          n = A->rmap->n;
344:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
345:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
346:   cusparseStatus_t                  stat;
347:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
348:   const MatScalar                   *aa = a->a,*v;
349:   PetscInt                          *AiUp, *AjUp;
350:   PetscScalar                       *AAUp;
351:   PetscInt                          i,nz, nzUpper, offset;
352:   PetscErrorCode                    ierr;

355:   if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
356:     try {
357:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
358:       nzUpper = adiag[0]-adiag[n];

360:       /* Allocate Space for the upper triangular matrix */
361:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
362:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
363:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

365:       /* Fill the upper triangular matrix */
366:       AiUp[0]=(PetscInt) 0;
367:       AiUp[n]=nzUpper;
368:       offset = nzUpper;
369:       for (i=n-1; i>=0; i--) {
370:         v  = aa + adiag[i+1] + 1;
371:         vi = aj + adiag[i+1] + 1;

373:         /* number of elements NOT on the diagonal */
374:         nz = adiag[i] - adiag[i+1]-1;

376:         /* decrement the offset */
377:         offset -= (nz+1);

379:         /* first, set the diagonal elements */
380:         AjUp[offset] = (PetscInt) i;
381:         AAUp[offset] = (MatScalar)1./v[nz];
382:         AiUp[i]      = AiUp[i+1] - (nz+1);

384:         PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));
385:         PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));
386:       }

388:       /* allocate space for the triangular factor information */
389:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

391:       /* Create the matrix description */
392:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
393:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
394:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
395:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
396:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

398:       /* Create the solve analysis information */
399:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

401:       /* set the operation */
402:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

404:       /* set the matrix */
405:       upTriFactor->csrMat = new CsrMatrix;
406:       upTriFactor->csrMat->num_rows = n;
407:       upTriFactor->csrMat->num_cols = n;
408:       upTriFactor->csrMat->num_entries = nzUpper;

410:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
411:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

413:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
414:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

416:       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
417:       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

419:       /* perform the solve analysis */
420:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
421:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
422:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
423:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

425:       /* assign the pointer. Is this really necessary? */
426:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

428:       cudaFreeHost(AiUp);CHKERRCUDA(ierr);
429:       cudaFreeHost(AjUp);CHKERRCUDA(ierr);
430:       cudaFreeHost(AAUp);CHKERRCUDA(ierr);
431:     } catch(char *ex) {
432:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
433:     }
434:   }
435:   return(0);
436: }

438: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
439: {
440:   PetscErrorCode               ierr;
441:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
442:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
443:   IS                           isrow = a->row,iscol = a->icol;
444:   PetscBool                    row_identity,col_identity;
445:   const PetscInt               *r,*c;
446:   PetscInt                     n = A->rmap->n;

449:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
450:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

452:   cusparseTriFactors->workVector = new THRUSTARRAY;
453:   cusparseTriFactors->workVector->resize(n);
454:   cusparseTriFactors->nnz=a->nz;

456:   A->valid_GPU_matrix = PETSC_CUDA_BOTH;
457:   /*lower triangular indices */
458:   ISGetIndices(isrow,&r);
459:   ISIdentity(isrow,&row_identity);
460:   if (!row_identity) {
461:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
462:     cusparseTriFactors->rpermIndices->assign(r, r+n);
463:   }
464:   ISRestoreIndices(isrow,&r);

466:   /*upper triangular indices */
467:   ISGetIndices(iscol,&c);
468:   ISIdentity(iscol,&col_identity);
469:   if (!col_identity) {
470:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
471:     cusparseTriFactors->cpermIndices->assign(c, c+n);
472:   }
473:   ISRestoreIndices(iscol,&c);
474:   return(0);
475: }

477: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
478: {
479:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
480:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
481:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
482:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
483:   cusparseStatus_t                  stat;
484:   PetscErrorCode                    ierr;
485:   PetscInt                          *AiUp, *AjUp;
486:   PetscScalar                       *AAUp;
487:   PetscScalar                       *AALo;
488:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
489:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
490:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
491:   const MatScalar                   *aa = b->a,*v;

494:   if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
495:     try {
496:       /* Allocate Space for the upper triangular matrix */
497:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
498:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
499:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
500:       cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

502:       /* Fill the upper triangular matrix */
503:       AiUp[0]=(PetscInt) 0;
504:       AiUp[n]=nzUpper;
505:       offset = 0;
506:       for (i=0; i<n; i++) {
507:         /* set the pointers */
508:         v  = aa + ai[i];
509:         vj = aj + ai[i];
510:         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

512:         /* first, set the diagonal elements */
513:         AjUp[offset] = (PetscInt) i;
514:         AAUp[offset] = (MatScalar)1.0/v[nz];
515:         AiUp[i]      = offset;
516:         AALo[offset] = (MatScalar)1.0/v[nz];

518:         offset+=1;
519:         if (nz>0) {
520:           PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));
521:           PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));
522:           for (j=offset; j<offset+nz; j++) {
523:             AAUp[j] = -AAUp[j];
524:             AALo[j] = AAUp[j]/v[nz];
525:           }
526:           offset+=nz;
527:         }
528:       }

530:       /* allocate space for the triangular factor information */
531:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

533:       /* Create the matrix description */
534:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
535:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
536:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
537:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
538:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

540:       /* Create the solve analysis information */
541:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

543:       /* set the operation */
544:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

546:       /* set the matrix */
547:       upTriFactor->csrMat = new CsrMatrix;
548:       upTriFactor->csrMat->num_rows = A->rmap->n;
549:       upTriFactor->csrMat->num_cols = A->cmap->n;
550:       upTriFactor->csrMat->num_entries = a->nz;

552:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
553:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

555:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
556:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

558:       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
559:       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

561:       /* perform the solve analysis */
562:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
563:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
564:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
565:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

567:       /* assign the pointer. Is this really necessary? */
568:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

570:       /* allocate space for the triangular factor information */
571:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

573:       /* Create the matrix description */
574:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
575:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
576:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
577:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
578:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

580:       /* Create the solve analysis information */
581:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

583:       /* set the operation */
584:       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

586:       /* set the matrix */
587:       loTriFactor->csrMat = new CsrMatrix;
588:       loTriFactor->csrMat->num_rows = A->rmap->n;
589:       loTriFactor->csrMat->num_cols = A->cmap->n;
590:       loTriFactor->csrMat->num_entries = a->nz;

592:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
593:       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

595:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
596:       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

598:       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
599:       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

601:       /* perform the solve analysis */
602:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
603:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
604:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
605:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

607:       /* assign the pointer. Is this really necessary? */
608:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

610:       A->valid_GPU_matrix = PETSC_CUDA_BOTH;
611:       cudaFreeHost(AiUp);CHKERRCUDA(ierr);
612:       cudaFreeHost(AjUp);CHKERRCUDA(ierr);
613:       cudaFreeHost(AAUp);CHKERRCUDA(ierr);
614:       cudaFreeHost(AALo);CHKERRCUDA(ierr);
615:     } catch(char *ex) {
616:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
617:     }
618:   }
619:   return(0);
620: }

622: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
623: {
624:   PetscErrorCode               ierr;
625:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
626:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
627:   IS                           ip = a->row;
628:   const PetscInt               *rip;
629:   PetscBool                    perm_identity;
630:   PetscInt                     n = A->rmap->n;

633:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
634:   cusparseTriFactors->workVector = new THRUSTARRAY;
635:   cusparseTriFactors->workVector->resize(n);
636:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;

638:   /*lower triangular indices */
639:   ISGetIndices(ip,&rip);
640:   ISIdentity(ip,&perm_identity);
641:   if (!perm_identity) {
642:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
643:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
644:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
645:     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
646:   }
647:   ISRestoreIndices(ip,&rip);
648:   return(0);
649: }

651: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
652: {
653:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
654:   IS             isrow = b->row,iscol = b->col;
655:   PetscBool      row_identity,col_identity;

659:   MatLUFactorNumeric_SeqAIJ(B,A,info);
660:   /* determine which version of MatSolve needs to be used. */
661:   ISIdentity(isrow,&row_identity);
662:   ISIdentity(iscol,&col_identity);
663:   if (row_identity && col_identity) {
664:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
665:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
666:   } else {
667:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
668:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
669:   }

671:   /* get the triangular factors */
672:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
673:   return(0);
674: }

676: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
677: {
678:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
679:   IS             ip = b->row;
680:   PetscBool      perm_identity;

684:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);

686:   /* determine which version of MatSolve needs to be used. */
687:   ISIdentity(ip,&perm_identity);
688:   if (perm_identity) {
689:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
690:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
691:   } else {
692:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
693:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
694:   }

696:   /* get the triangular factors */
697:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
698:   return(0);
699: }

701: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
702: {
703:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
704:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
705:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
706:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
707:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
708:   cusparseStatus_t                  stat;
709:   cusparseIndexBase_t               indexBase;
710:   cusparseMatrixType_t              matrixType;
711:   cusparseFillMode_t                fillMode;
712:   cusparseDiagType_t                diagType;


716:   /*********************************************/
717:   /* Now the Transpose of the Lower Tri Factor */
718:   /*********************************************/

720:   /* allocate space for the transpose of the lower triangular factor */
721:   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

723:   /* set the matrix descriptors of the lower triangular factor */
724:   matrixType = cusparseGetMatType(loTriFactor->descr);
725:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
726:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
727:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
728:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

730:   /* Create the matrix description */
731:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
732:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
733:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
734:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
735:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

737:   /* Create the solve analysis information */
738:   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

740:   /* set the operation */
741:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

743:   /* allocate GPU space for the CSC of the lower triangular factor*/
744:   loTriFactorT->csrMat = new CsrMatrix;
745:   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
746:   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
747:   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
748:   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
749:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
750:   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

752:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
753:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
754:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
755:                           loTriFactor->csrMat->values->data().get(),
756:                           loTriFactor->csrMat->row_offsets->data().get(),
757:                           loTriFactor->csrMat->column_indices->data().get(),
758:                           loTriFactorT->csrMat->values->data().get(),
759:                           loTriFactorT->csrMat->column_indices->data().get(),
760:                           loTriFactorT->csrMat->row_offsets->data().get(),
761:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

763:   /* perform the solve analysis on the transposed matrix */
764:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
765:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
766:                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
767:                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
768:                            loTriFactorT->solveInfo);CHKERRCUDA(stat);

770:   /* assign the pointer. Is this really necessary? */
771:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

773:   /*********************************************/
774:   /* Now the Transpose of the Upper Tri Factor */
775:   /*********************************************/

777:   /* allocate space for the transpose of the upper triangular factor */
778:   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

780:   /* set the matrix descriptors of the upper triangular factor */
781:   matrixType = cusparseGetMatType(upTriFactor->descr);
782:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
783:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
784:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
785:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

787:   /* Create the matrix description */
788:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
789:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
790:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
791:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
792:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

794:   /* Create the solve analysis information */
795:   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

797:   /* set the operation */
798:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

800:   /* allocate GPU space for the CSC of the upper triangular factor*/
801:   upTriFactorT->csrMat = new CsrMatrix;
802:   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
803:   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
804:   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
805:   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
806:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
807:   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

809:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
810:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
811:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
812:                           upTriFactor->csrMat->values->data().get(),
813:                           upTriFactor->csrMat->row_offsets->data().get(),
814:                           upTriFactor->csrMat->column_indices->data().get(),
815:                           upTriFactorT->csrMat->values->data().get(),
816:                           upTriFactorT->csrMat->column_indices->data().get(),
817:                           upTriFactorT->csrMat->row_offsets->data().get(),
818:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

820:   /* perform the solve analysis on the transposed matrix */
821:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
822:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
823:                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
824:                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
825:                            upTriFactorT->solveInfo);CHKERRCUDA(stat);

827:   /* assign the pointer. Is this really necessary? */
828:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
829:   return(0);
830: }

832: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
833: {
834:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
835:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
836:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
837:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
838:   cusparseStatus_t             stat;
839:   cusparseIndexBase_t          indexBase;
840:   cudaError_t                  err;


844:   /* allocate space for the triangular factor information */
845:   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
846:   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
847:   indexBase = cusparseGetMatIndexBase(matstruct->descr);
848:   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
849:   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

851:   /* set alpha and beta */
852:   err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
853:   err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
854:   err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
855:   err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
856:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

858:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
859:     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
860:     CsrMatrix *matrixT= new CsrMatrix;
861:     matrixT->num_rows = A->rmap->n;
862:     matrixT->num_cols = A->cmap->n;
863:     matrixT->num_entries = a->nz;
864:     matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
865:     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
866:     matrixT->values = new THRUSTARRAY(a->nz);

868:     /* compute the transpose of the upper triangular factor, i.e. the CSC */
869:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
870:     stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
871:                             matrix->num_cols, matrix->num_entries,
872:                             matrix->values->data().get(),
873:                             matrix->row_offsets->data().get(),
874:                             matrix->column_indices->data().get(),
875:                             matrixT->values->data().get(),
876:                             matrixT->column_indices->data().get(),
877:                             matrixT->row_offsets->data().get(),
878:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

880:     /* assign the pointer */
881:     matstructT->mat = matrixT;

883:   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
884: #if CUDA_VERSION>=5000
885:     /* First convert HYB to CSR */
886:     CsrMatrix *temp= new CsrMatrix;
887:     temp->num_rows = A->rmap->n;
888:     temp->num_cols = A->cmap->n;
889:     temp->num_entries = a->nz;
890:     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
891:     temp->column_indices = new THRUSTINTARRAY32(a->nz);
892:     temp->values = new THRUSTARRAY(a->nz);


895:     stat = cusparse_hyb2csr(cusparsestruct->handle,
896:                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
897:                             temp->values->data().get(),
898:                             temp->row_offsets->data().get(),
899:                             temp->column_indices->data().get());CHKERRCUDA(stat);

901:     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
902:     CsrMatrix *tempT= new CsrMatrix;
903:     tempT->num_rows = A->rmap->n;
904:     tempT->num_cols = A->cmap->n;
905:     tempT->num_entries = a->nz;
906:     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
907:     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
908:     tempT->values = new THRUSTARRAY(a->nz);

910:     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
911:                             temp->num_cols, temp->num_entries,
912:                             temp->values->data().get(),
913:                             temp->row_offsets->data().get(),
914:                             temp->column_indices->data().get(),
915:                             tempT->values->data().get(),
916:                             tempT->column_indices->data().get(),
917:                             tempT->row_offsets->data().get(),
918:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

920:     /* Last, convert CSC to HYB */
921:     cusparseHybMat_t hybMat;
922:     stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
923:     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
924:       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
925:     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
926:                             matstructT->descr, tempT->values->data().get(),
927:                             tempT->row_offsets->data().get(),
928:                             tempT->column_indices->data().get(),
929:                             hybMat, 0, partition);CHKERRCUDA(stat);

931:     /* assign the pointer */
932:     matstructT->mat = hybMat;

934:     /* delete temporaries */
935:     if (tempT) {
936:       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
937:       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
938:       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
939:       delete (CsrMatrix*) tempT;
940:     }
941:     if (temp) {
942:       if (temp->values) delete (THRUSTARRAY*) temp->values;
943:       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
944:       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
945:       delete (CsrMatrix*) temp;
946:     }
947: #else
948:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
949: #endif
950:   }
951:   /* assign the compressed row indices */
952:   matstructT->cprowIndices = new THRUSTINTARRAY;

954:   /* assign the pointer */
955:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
956:   return(0);
957: }

959: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
960: {
961:   PetscInt                              n = xx->map->n;
962:   const PetscScalar                     *barray;
963:   PetscScalar                           *xarray;
964:   thrust::device_ptr<const PetscScalar> bGPU;
965:   thrust::device_ptr<PetscScalar>       xGPU;
966:   cusparseStatus_t                      stat;
967:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
968:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
969:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
970:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
971:   PetscErrorCode                        ierr;

974:   /* Analyze the matrix and create the transpose ... on the fly */
975:   if (!loTriFactorT && !upTriFactorT) {
976:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
977:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
978:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
979:   }

981:   /* Get the GPU pointers */
982:   VecCUDAGetArrayWrite(xx,&xarray);
983:   VecCUDAGetArrayRead(bb,&barray);
984:   xGPU = thrust::device_pointer_cast(xarray);
985:   bGPU = thrust::device_pointer_cast(barray);

987:   /* First, reorder with the row permutation */
988:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
989:                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
990:                xGPU);

992:   /* First, solve U */
993:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
994:                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
995:                         upTriFactorT->csrMat->values->data().get(),
996:                         upTriFactorT->csrMat->row_offsets->data().get(),
997:                         upTriFactorT->csrMat->column_indices->data().get(),
998:                         upTriFactorT->solveInfo,
999:                         xarray, tempGPU->data().get());CHKERRCUDA(stat);

1001:   /* Then, solve L */
1002:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1003:                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1004:                         loTriFactorT->csrMat->values->data().get(),
1005:                         loTriFactorT->csrMat->row_offsets->data().get(),
1006:                         loTriFactorT->csrMat->column_indices->data().get(),
1007:                         loTriFactorT->solveInfo,
1008:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1010:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1011:   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1012:                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1013:                tempGPU->begin());

1015:   /* Copy the temporary to the full solution. */
1016:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

1018:   /* restore */
1019:   VecCUDARestoreArrayRead(bb,&barray);
1020:   VecCUDARestoreArrayWrite(xx,&xarray);
1021:   WaitForGPU();CHKERRCUDA(ierr);

1023:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1024:   return(0);
1025: }

1027: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1028: {
1029:   const PetscScalar                 *barray;
1030:   PetscScalar                       *xarray;
1031:   cusparseStatus_t                  stat;
1032:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1033:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1034:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1035:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1036:   PetscErrorCode                    ierr;

1039:   /* Analyze the matrix and create the transpose ... on the fly */
1040:   if (!loTriFactorT && !upTriFactorT) {
1041:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1042:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1043:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1044:   }

1046:   /* Get the GPU pointers */
1047:   VecCUDAGetArrayWrite(xx,&xarray);
1048:   VecCUDAGetArrayRead(bb,&barray);

1050:   /* First, solve U */
1051:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1052:                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1053:                         upTriFactorT->csrMat->values->data().get(),
1054:                         upTriFactorT->csrMat->row_offsets->data().get(),
1055:                         upTriFactorT->csrMat->column_indices->data().get(),
1056:                         upTriFactorT->solveInfo,
1057:                         barray, tempGPU->data().get());CHKERRCUDA(stat);

1059:   /* Then, solve L */
1060:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1061:                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1062:                         loTriFactorT->csrMat->values->data().get(),
1063:                         loTriFactorT->csrMat->row_offsets->data().get(),
1064:                         loTriFactorT->csrMat->column_indices->data().get(),
1065:                         loTriFactorT->solveInfo,
1066:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1068:   /* restore */
1069:   VecCUDARestoreArrayRead(bb,&barray);
1070:   VecCUDARestoreArrayWrite(xx,&xarray);
1071:   WaitForGPU();CHKERRCUDA(ierr);
1072:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1073:   return(0);
1074: }

1076: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1077: {
1078:   const PetscScalar                     *barray;
1079:   PetscScalar                           *xarray;
1080:   thrust::device_ptr<const PetscScalar> bGPU;
1081:   thrust::device_ptr<PetscScalar>       xGPU;
1082:   cusparseStatus_t                      stat;
1083:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1084:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1085:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1086:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1087:   PetscErrorCode                        ierr;


1091:   /* Get the GPU pointers */
1092:   VecCUDAGetArrayWrite(xx,&xarray);
1093:   VecCUDAGetArrayRead(bb,&barray);
1094:   xGPU = thrust::device_pointer_cast(xarray);
1095:   bGPU = thrust::device_pointer_cast(barray);

1097:   /* First, reorder with the row permutation */
1098:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1099:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1100:                xGPU);

1102:   /* Next, solve L */
1103:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1104:                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1105:                         loTriFactor->csrMat->values->data().get(),
1106:                         loTriFactor->csrMat->row_offsets->data().get(),
1107:                         loTriFactor->csrMat->column_indices->data().get(),
1108:                         loTriFactor->solveInfo,
1109:                         xarray, tempGPU->data().get());CHKERRCUDA(stat);

1111:   /* Then, solve U */
1112:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1113:                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1114:                         upTriFactor->csrMat->values->data().get(),
1115:                         upTriFactor->csrMat->row_offsets->data().get(),
1116:                         upTriFactor->csrMat->column_indices->data().get(),
1117:                         upTriFactor->solveInfo,
1118:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1120:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1121:   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1122:                thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
1123:                tempGPU->begin());

1125:   /* Copy the temporary to the full solution. */
1126:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

1128:   VecCUDARestoreArrayRead(bb,&barray);
1129:   VecCUDARestoreArrayWrite(xx,&xarray);
1130:   WaitForGPU();CHKERRCUDA(ierr);
1131:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1132:   return(0);
1133: }

1135: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1136: {
1137:   const PetscScalar                 *barray;
1138:   PetscScalar                       *xarray;
1139:   cusparseStatus_t                  stat;
1140:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1141:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1142:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1143:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1144:   PetscErrorCode                    ierr;

1147:   /* Get the GPU pointers */
1148:   VecCUDAGetArrayWrite(xx,&xarray);
1149:   VecCUDAGetArrayRead(bb,&barray);

1151:   /* First, solve L */
1152:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1153:                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1154:                         loTriFactor->csrMat->values->data().get(),
1155:                         loTriFactor->csrMat->row_offsets->data().get(),
1156:                         loTriFactor->csrMat->column_indices->data().get(),
1157:                         loTriFactor->solveInfo,
1158:                         barray, tempGPU->data().get());CHKERRCUDA(stat);

1160:   /* Next, solve U */
1161:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1162:                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1163:                         upTriFactor->csrMat->values->data().get(),
1164:                         upTriFactor->csrMat->row_offsets->data().get(),
1165:                         upTriFactor->csrMat->column_indices->data().get(),
1166:                         upTriFactor->solveInfo,
1167:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1169:   VecCUDARestoreArrayRead(bb,&barray);
1170:   VecCUDARestoreArrayWrite(xx,&xarray);
1171:   WaitForGPU();CHKERRCUDA(ierr);
1172:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1173:   return(0);
1174: }

1176: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1177: {

1179:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1180:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1181:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1182:   PetscInt                     m = A->rmap->n,*ii,*ridx;
1183:   PetscErrorCode               ierr;
1184:   cusparseStatus_t             stat;
1185:   cudaError_t                  err;

1188:   if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
1189:     PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1190:     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1191:       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1192:       /* copy values only */
1193:       matrix->values->assign(a->a, a->a+a->nz);
1194:     } else {
1195:       Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1196:       try {
1197:         cusparsestruct->nonzerorow=0;
1198:         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

1200:         if (a->compressedrow.use) {
1201:           m    = a->compressedrow.nrows;
1202:           ii   = a->compressedrow.i;
1203:           ridx = a->compressedrow.rindex;
1204:         } else {
1205:           /* Forcing compressed row on the GPU */
1206:           int k=0;
1207:           PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1208:           PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1209:           ii[0]=0;
1210:           for (int j = 0; j<m; j++) {
1211:             if ((a->i[j+1]-a->i[j])>0) {
1212:               ii[k]  = a->i[j];
1213:               ridx[k]= j;
1214:               k++;
1215:             }
1216:           }
1217:           ii[cusparsestruct->nonzerorow] = a->nz;
1218:           m = cusparsestruct->nonzerorow;
1219:         }

1221:         /* allocate space for the triangular factor information */
1222:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1223:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1224:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1225:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

1227:         err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
1228:         err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1229:         err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
1230:         err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1231:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

1233:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1234:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1235:           /* set the matrix */
1236:           CsrMatrix *matrix= new CsrMatrix;
1237:           matrix->num_rows = m;
1238:           matrix->num_cols = A->cmap->n;
1239:           matrix->num_entries = a->nz;
1240:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1241:           matrix->row_offsets->assign(ii, ii + m+1);

1243:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1244:           matrix->column_indices->assign(a->j, a->j+a->nz);

1246:           matrix->values = new THRUSTARRAY(a->nz);
1247:           matrix->values->assign(a->a, a->a+a->nz);

1249:           /* assign the pointer */
1250:           matstruct->mat = matrix;

1252:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1253: #if CUDA_VERSION>=4020
1254:           CsrMatrix *matrix= new CsrMatrix;
1255:           matrix->num_rows = m;
1256:           matrix->num_cols = A->cmap->n;
1257:           matrix->num_entries = a->nz;
1258:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1259:           matrix->row_offsets->assign(ii, ii + m+1);

1261:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1262:           matrix->column_indices->assign(a->j, a->j+a->nz);

1264:           matrix->values = new THRUSTARRAY(a->nz);
1265:           matrix->values->assign(a->a, a->a+a->nz);

1267:           cusparseHybMat_t hybMat;
1268:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1269:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1270:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1271:           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1272:               matstruct->descr, matrix->values->data().get(),
1273:               matrix->row_offsets->data().get(),
1274:               matrix->column_indices->data().get(),
1275:               hybMat, 0, partition);CHKERRCUDA(stat);
1276:           /* assign the pointer */
1277:           matstruct->mat = hybMat;

1279:           if (matrix) {
1280:             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1281:             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1282:             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1283:             delete (CsrMatrix*)matrix;
1284:           }
1285: #endif
1286:         }

1288:         /* assign the compressed row indices */
1289:         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1290:         matstruct->cprowIndices->assign(ridx,ridx+m);

1292:         /* assign the pointer */
1293:         cusparsestruct->mat = matstruct;

1295:         if (!a->compressedrow.use) {
1296:           PetscFree(ii);
1297:           PetscFree(ridx);
1298:         }
1299:         cusparsestruct->workVector = new THRUSTARRAY;
1300:         cusparsestruct->workVector->resize(m);
1301:       } catch(char *ex) {
1302:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1303:       }
1304:       cusparsestruct->nonzerostate = A->nonzerostate;
1305:     }
1306:     WaitForGPU();CHKERRCUDA(ierr);
1307:     A->valid_GPU_matrix = PETSC_CUDA_BOTH;
1308:     PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1309:   }
1310:   return(0);
1311: }

1313: static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
1314: {
1316:   PetscInt rbs,cbs;

1319:   MatGetBlockSizes(mat,&rbs,&cbs);
1320:   if (right) {
1321:     VecCreate(PetscObjectComm((PetscObject)mat),right);
1322:     VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
1323:     VecSetBlockSize(*right,cbs);
1324:     VecSetType(*right,VECSEQCUDA);
1325:     PetscLayoutReference(mat->cmap,&(*right)->map);
1326:   }
1327:   if (left) {
1328:     VecCreate(PetscObjectComm((PetscObject)mat),left);
1329:     VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
1330:     VecSetBlockSize(*left,rbs);
1331:     VecSetType(*left,VECSEQCUDA);
1332:     PetscLayoutReference(mat->rmap,&(*left)->map);
1333:   }
1334:   return(0);
1335: }

1337: struct VecCUDAPlusEquals
1338: {
1339:   template <typename Tuple>
1340:   __host__ __device__
1341:   void operator()(Tuple t)
1342:   {
1343:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1344:   }
1345: };

1347: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1348: {
1349:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1350:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1351:   Mat_SeqAIJCUSPARSEMultStruct *matstruct; /* Do not initialize yet, because GPU data may not be available yet */
1352:   const PetscScalar            *xarray;
1353:   PetscScalar                  *yarray;
1354:   PetscErrorCode               ierr;
1355:   cusparseStatus_t             stat;

1358:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1359:   MatSeqAIJCUSPARSECopyToGPU(A);
1360:   VecCUDAGetArrayRead(xx,&xarray);
1361:   VecSet(yy,0);
1362:   VecCUDAGetArrayWrite(yy,&yarray);
1363:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1364:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1365:     CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1366:     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1367:                              mat->num_rows, mat->num_cols, mat->num_entries,
1368:                              matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
1369:                              mat->column_indices->data().get(), xarray, matstruct->beta,
1370:                              yarray);CHKERRCUDA(stat);
1371:   } else {
1372: #if CUDA_VERSION>=4020
1373:     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1374:     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1375:                              matstruct->alpha, matstruct->descr, hybMat,
1376:                              xarray, matstruct->beta,
1377:                              yarray);CHKERRCUDA(stat);
1378: #endif
1379:   }
1380:   VecCUDARestoreArrayRead(xx,&xarray);
1381:   VecCUDARestoreArrayWrite(yy,&yarray);
1382:   if (!cusparsestruct->stream) {
1383:     WaitForGPU();CHKERRCUDA(ierr);
1384:   }
1385:   PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1386:   return(0);
1387: }

1389: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1390: {
1391:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1392:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1393:   Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1394:   const PetscScalar            *xarray;
1395:   PetscScalar                  *yarray;
1396:   PetscErrorCode               ierr;
1397:   cusparseStatus_t             stat;

1400:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1401:   MatSeqAIJCUSPARSECopyToGPU(A);
1402:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1403:   if (!matstructT) {
1404:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1405:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1406:   }
1407:   VecCUDAGetArrayRead(xx,&xarray);
1408:   VecSet(yy,0);
1409:   VecCUDAGetArrayWrite(yy,&yarray);

1411:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1412:     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1413:     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1414:                              mat->num_rows, mat->num_cols,
1415:                              mat->num_entries, matstructT->alpha, matstructT->descr,
1416:                              mat->values->data().get(), mat->row_offsets->data().get(),
1417:                              mat->column_indices->data().get(), xarray, matstructT->beta,
1418:                              yarray);CHKERRCUDA(stat);
1419:   } else {
1420: #if CUDA_VERSION>=4020
1421:     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1422:     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1423:                              matstructT->alpha, matstructT->descr, hybMat,
1424:                              xarray, matstructT->beta,
1425:                              yarray);CHKERRCUDA(stat);
1426: #endif
1427:   }
1428:   VecCUDARestoreArrayRead(xx,&xarray);
1429:   VecCUDARestoreArrayWrite(yy,&yarray);
1430:   if (!cusparsestruct->stream) {
1431:     WaitForGPU();CHKERRCUDA(ierr);
1432:   }
1433:   PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1434:   return(0);
1435: }


1438: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1439: {
1440:   Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
1441:   Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1442:   Mat_SeqAIJCUSPARSEMultStruct    *matstruct;
1443:   thrust::device_ptr<PetscScalar> zptr;
1444:   const PetscScalar               *xarray;
1445:   PetscScalar                     *zarray;
1446:   PetscErrorCode                  ierr;
1447:   cusparseStatus_t                stat;

1450:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1451:   MatSeqAIJCUSPARSECopyToGPU(A);
1452:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1453:   try {
1454:     VecCopy_SeqCUDA(yy,zz);
1455:     VecCUDAGetArrayRead(xx,&xarray);
1456:     VecCUDAGetArrayReadWrite(zz,&zarray);
1457:     zptr = thrust::device_pointer_cast(zarray);

1459:     /* multiply add */
1460:     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1461:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1462:     /* here we need to be careful to set the number of rows in the multiply to the
1463:        number of compressed rows in the matrix ... which is equivalent to the
1464:        size of the workVector */
1465:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1466:                                mat->num_rows, mat->num_cols,
1467:                                mat->num_entries, matstruct->alpha, matstruct->descr,
1468:                                mat->values->data().get(), mat->row_offsets->data().get(),
1469:                                mat->column_indices->data().get(), xarray, matstruct->beta,
1470:                                cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1471:     } else {
1472: #if CUDA_VERSION>=4020
1473:       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1474:       if (cusparsestruct->workVector->size()) {
1475:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1476:             matstruct->alpha, matstruct->descr, hybMat,
1477:             xarray, matstruct->beta,
1478:             cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1479:       }
1480: #endif
1481:     }

1483:     /* scatter the data from the temporary into the full vector with a += operation */
1484:     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1485:         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1486:         VecCUDAPlusEquals());
1487:     VecCUDARestoreArrayRead(xx,&xarray);
1488:     VecCUDARestoreArrayReadWrite(zz,&zarray);

1490:   } catch(char *ex) {
1491:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1492:   }
1493:   WaitForGPU();CHKERRCUDA(ierr);
1494:   PetscLogFlops(2.0*a->nz);
1495:   return(0);
1496: }

1498: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1499: {
1500:   Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
1501:   Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1502:   Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
1503:   thrust::device_ptr<PetscScalar> zptr;
1504:   const PetscScalar               *xarray;
1505:   PetscScalar                     *zarray;
1506:   PetscErrorCode                  ierr;
1507:   cusparseStatus_t                stat;

1510:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1511:   MatSeqAIJCUSPARSECopyToGPU(A);
1512:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1513:   if (!matstructT) {
1514:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1515:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1516:   }

1518:   try {
1519:     VecCopy_SeqCUDA(yy,zz);
1520:     VecCUDAGetArrayRead(xx,&xarray);
1521:     VecCUDAGetArrayReadWrite(zz,&zarray);
1522:     zptr = thrust::device_pointer_cast(zarray);

1524:     /* multiply add with matrix transpose */
1525:     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1526:       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1527:       /* here we need to be careful to set the number of rows in the multiply to the
1528:          number of compressed rows in the matrix ... which is equivalent to the
1529:          size of the workVector */
1530:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1531:                                mat->num_rows, mat->num_cols,
1532:                                mat->num_entries, matstructT->alpha, matstructT->descr,
1533:                                mat->values->data().get(), mat->row_offsets->data().get(),
1534:                                mat->column_indices->data().get(), xarray, matstructT->beta,
1535:                                cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1536:     } else {
1537: #if CUDA_VERSION>=4020
1538:       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1539:       if (cusparsestruct->workVector->size()) {
1540:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1541:             matstructT->alpha, matstructT->descr, hybMat,
1542:             xarray, matstructT->beta,
1543:             cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1544:       }
1545: #endif
1546:     }

1548:     /* scatter the data from the temporary into the full vector with a += operation */
1549:     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
1550:         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1551:         VecCUDAPlusEquals());

1553:     VecCUDARestoreArrayRead(xx,&xarray);
1554:     VecCUDARestoreArrayReadWrite(zz,&zarray);

1556:   } catch(char *ex) {
1557:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1558:   }
1559:   WaitForGPU();CHKERRCUDA(ierr);
1560:   PetscLogFlops(2.0*a->nz);
1561:   return(0);
1562: }

1564: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1565: {

1569:   MatAssemblyEnd_SeqAIJ(A,mode);
1570:   if (A->factortype==MAT_FACTOR_NONE) {
1571:     MatSeqAIJCUSPARSECopyToGPU(A);
1572:   }
1573:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1574:   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1575:   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1576:   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1577:   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1578:   return(0);
1579: }

1581: /* --------------------------------------------------------------------------------*/
1582: /*@
1583:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1584:    (the default parallel PETSc format). This matrix will ultimately pushed down
1585:    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1586:    assembly performance the user should preallocate the matrix storage by setting
1587:    the parameter nz (or the array nnz).  By setting these parameters accurately,
1588:    performance during matrix assembly can be increased by more than a factor of 50.

1590:    Collective on MPI_Comm

1592:    Input Parameters:
1593: +  comm - MPI communicator, set to PETSC_COMM_SELF
1594: .  m - number of rows
1595: .  n - number of columns
1596: .  nz - number of nonzeros per row (same for all rows)
1597: -  nnz - array containing the number of nonzeros in the various rows
1598:          (possibly different for each row) or NULL

1600:    Output Parameter:
1601: .  A - the matrix

1603:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1604:    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1605:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

1607:    Notes:
1608:    If nnz is given then nz is ignored

1610:    The AIJ format (also called the Yale sparse matrix format or
1611:    compressed row storage), is fully compatible with standard Fortran 77
1612:    storage.  That is, the stored row and column indices can begin at
1613:    either one (as in Fortran) or zero.  See the users' manual for details.

1615:    Specify the preallocated storage with either nz or nnz (not both).
1616:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1617:    allocation.  For large problems you MUST preallocate memory or you
1618:    will get TERRIBLE performance, see the users' manual chapter on matrices.

1620:    By default, this format uses inodes (identical nodes) when possible, to
1621:    improve numerical efficiency of matrix-vector products and solves. We
1622:    search for consecutive rows with the same nonzero structure, thereby
1623:    reusing matrix information to achieve increased efficiency.

1625:    Level: intermediate

1627: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1628: @*/
1629: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1630: {

1634:   MatCreate(comm,A);
1635:   MatSetSizes(*A,m,n,m,n);
1636:   MatSetType(*A,MATSEQAIJCUSPARSE);
1637:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1638:   return(0);
1639: }

1641: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1642: {
1643:   PetscErrorCode   ierr;

1646:   if (A->factortype==MAT_FACTOR_NONE) {
1647:     if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) {
1648:       Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1649:     }
1650:   } else {
1651:     Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1652:   }
1653:   MatDestroy_SeqAIJ(A);
1654:   return(0);
1655: }

1657: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1658: {
1660:   Mat C;
1661:   cusparseStatus_t stat;
1662:   cusparseHandle_t handle=0;

1665:   MatDuplicate_SeqAIJ(A,cpvalues,B);
1666:   C = *B;
1667:   /* inject CUSPARSE-specific stuff */
1668:   if (C->factortype==MAT_FACTOR_NONE) {
1669:     /* you cannot check the inode.use flag here since the matrix was just created.
1670:        now build a GPU matrix data structure */
1671:     C->spptr = new Mat_SeqAIJCUSPARSE;
1672:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
1673:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
1674:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
1675:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
1676:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
1677:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = 0;
1678:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1679:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
1680:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
1681:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
1682:   } else {
1683:     /* NEXT, set the pointers to the triangular factors */
1684:     C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1685:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
1686:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
1687:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1688:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1689:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
1690:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
1691:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
1692:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
1693:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1694:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
1695:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
1696:   }

1698:   C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1699:   C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1700:   C->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
1701:   C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1702:   C->ops->mult             = MatMult_SeqAIJCUSPARSE;
1703:   C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1704:   C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1705:   C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1706:   C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

1708:   PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);

1710:   C->valid_GPU_matrix = PETSC_CUDA_UNALLOCATED;

1712:   PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1713:   return(0);
1714: }

1716: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1717: {
1719:   cusparseStatus_t stat;
1720:   cusparseHandle_t handle=0;

1723:   MatCreate_SeqAIJ(B);
1724:   if (B->factortype==MAT_FACTOR_NONE) {
1725:     /* you cannot check the inode.use flag here since the matrix was just created.
1726:        now build a GPU matrix data structure */
1727:     B->spptr = new Mat_SeqAIJCUSPARSE;
1728:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1729:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1730:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1731:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1732:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1733:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = 0;
1734:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1735:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
1736:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1737:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
1738:   } else {
1739:     /* NEXT, set the pointers to the triangular factors */
1740:     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1741:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
1742:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1743:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1744:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1745:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1746:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1747:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1748:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = 0;
1749:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1750:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1751:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
1752:   }

1754:   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1755:   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1756:   B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
1757:   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1758:   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1759:   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1760:   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1761:   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1762:   B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

1764:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);

1766:   B->valid_GPU_matrix = PETSC_CUDA_UNALLOCATED;

1768:   PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1769:   return(0);
1770: }

1772: /*M
1773:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

1775:    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1776:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1777:    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

1779:    Options Database Keys:
1780: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1781: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1782: .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

1784:   Level: beginner

1786: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1787: M*/

1789: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


1792: PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void)
1793: {

1797:   MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
1798:   MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
1799:   MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
1800:   MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
1801:   return(0);
1802: }


1805: static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1806: {
1807:   cusparseStatus_t stat;
1808:   cusparseHandle_t handle;

1811:   if (*cusparsestruct) {
1812:     Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1813:     Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1814:     delete (*cusparsestruct)->workVector;
1815:     if (handle = (*cusparsestruct)->handle) {
1816:       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1817:     }
1818:     delete *cusparsestruct;
1819:     *cusparsestruct = 0;
1820:   }
1821:   return(0);
1822: }

1824: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1825: {
1827:   if (*mat) {
1828:     delete (*mat)->values;
1829:     delete (*mat)->column_indices;
1830:     delete (*mat)->row_offsets;
1831:     delete *mat;
1832:     *mat = 0;
1833:   }
1834:   return(0);
1835: }

1837: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1838: {
1839:   cusparseStatus_t stat;
1840:   PetscErrorCode   ierr;

1843:   if (*trifactor) {
1844:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1845:     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1846:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
1847:     delete *trifactor;
1848:     *trifactor = 0;
1849:   }
1850:   return(0);
1851: }

1853: static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1854: {
1855:   CsrMatrix        *mat;
1856:   cusparseStatus_t stat;
1857:   cudaError_t      err;

1860:   if (*matstruct) {
1861:     if ((*matstruct)->mat) {
1862:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1863:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1864:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1865:       } else {
1866:         mat = (CsrMatrix*)(*matstruct)->mat;
1867:         CsrMatrix_Destroy(&mat);
1868:       }
1869:     }
1870:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1871:     delete (*matstruct)->cprowIndices;
1872:     if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1873:     if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
1874:     delete *matstruct;
1875:     *matstruct = 0;
1876:   }
1877:   return(0);
1878: }

1880: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1881: {
1882:   cusparseHandle_t handle;
1883:   cusparseStatus_t stat;

1886:   if (*trifactors) {
1887:     Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1888:     Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1889:     Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1890:     Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1891:     delete (*trifactors)->rpermIndices;
1892:     delete (*trifactors)->cpermIndices;
1893:     delete (*trifactors)->workVector;
1894:     if (handle = (*trifactors)->handle) {
1895:       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1896:     }
1897:     delete *trifactors;
1898:     *trifactors = 0;
1899:   }
1900:   return(0);
1901: }