Actual source code: aijcusparse.cu

petsc-3.9.4 2018-09-11
Report Typos and Errors
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8:  #include <../src/mat/impls/aij/seq/aij.h>
  9:  #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10:  #include <../src/vec/vec/impls/dvecimpl.h>
 11:  #include <petsc/private/vecimpl.h>
 12: #undef VecType
 13:  #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

 15: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

 17: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 18: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 19: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 21: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 22: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 23: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 25: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 26: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 27: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 28: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 29: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 30: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 31: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 32: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 33: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

 35: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 36: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 37: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 38: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 39: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 41: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 42: {
 43:   cusparseStatus_t   stat;
 44:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 47:   cusparsestruct->stream = stream;
 48:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
 49:   return(0);
 50: }

 52: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
 53: {
 54:   cusparseStatus_t   stat;
 55:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 58:   if (cusparsestruct->handle != handle) {
 59:     if (cusparsestruct->handle) {
 60:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
 61:     }
 62:     cusparsestruct->handle = handle;
 63:   }
 64:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
 65:   return(0);
 66: }

 68: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
 69: {
 70:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
 72:   if (cusparsestruct->handle)
 73:     cusparsestruct->handle = 0;
 74:   return(0);
 75: }

 77: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
 78: {
 80:   *type = MATSOLVERCUSPARSE;
 81:   return(0);
 82: }

 84: /*MC
 85:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
 86:   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
 87:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
 88:   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
 89:   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
 90:   algorithms are not recommended. This class does NOT support direct solver operations.

 92:   Level: beginner

 94: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
 95: M*/

 97: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
 98: {
100:   PetscInt       n = A->rmap->n;

103:   MatCreate(PetscObjectComm((PetscObject)A),B);
104:   (*B)->factortype = ftype;
105:   MatSetSizes(*B,n,n,n,n);
106:   MatSetType(*B,MATSEQAIJCUSPARSE);

108:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
109:     MatSetBlockSizesFromMats(*B,A,A);
110:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
111:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
115:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

117:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
118:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
119:   return(0);
120: }

122: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
123: {
124:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

127: #if CUDA_VERSION>=4020
128:   switch (op) {
129:   case MAT_CUSPARSE_MULT:
130:     cusparsestruct->format = format;
131:     break;
132:   case MAT_CUSPARSE_ALL:
133:     cusparsestruct->format = format;
134:     break;
135:   default:
136:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
137:   }
138: #else
139:   if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
140: #endif
141:   return(0);
142: }

144: /*@
145:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146:    operation. Only the MatMult operation can use different GPU storage formats
147:    for MPIAIJCUSPARSE matrices.
148:    Not Collective

150:    Input Parameters:
151: +  A - Matrix of type SEQAIJCUSPARSE
152: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

155:    Output Parameter:

157:    Level: intermediate

159: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160: @*/
161: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
162: {

167:   PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
168:   return(0);
169: }

171: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
172: {
173:   PetscErrorCode           ierr;
174:   MatCUSPARSEStorageFormat format;
175:   PetscBool                flg;
176:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

179:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
180:   PetscObjectOptionsBegin((PetscObject)A);
181:   if (A->factortype==MAT_FACTOR_NONE) {
182:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
183:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
184:     if (flg) {
185:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
186:     }
187:   }
188:   PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
189:                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
190:   if (flg) {
191:     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
192:   }
193:   PetscOptionsEnd();
194:   return(0);

196: }

198: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
199: {

203:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
204:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
205:   return(0);
206: }

208: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
209: {

213:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
214:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
215:   return(0);
216: }

218: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
219: {

223:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
224:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
225:   return(0);
226: }

228: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
229: {

233:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
234:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
235:   return(0);
236: }

238: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
239: {
240:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
241:   PetscInt                          n = A->rmap->n;
242:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
243:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
244:   cusparseStatus_t                  stat;
245:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
246:   const MatScalar                   *aa = a->a,*v;
247:   PetscInt                          *AiLo, *AjLo;
248:   PetscScalar                       *AALo;
249:   PetscInt                          i,nz, nzLower, offset, rowOffset;
250:   PetscErrorCode                    ierr;

253:   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
254:     try {
255:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
256:       nzLower=n+ai[n]-ai[1];

258:       /* Allocate Space for the lower triangular matrix */
259:       cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
260:       cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
261:       cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

263:       /* Fill the lower triangular matrix */
264:       AiLo[0]  = (PetscInt) 0;
265:       AiLo[n]  = nzLower;
266:       AjLo[0]  = (PetscInt) 0;
267:       AALo[0]  = (MatScalar) 1.0;
268:       v        = aa;
269:       vi       = aj;
270:       offset   = 1;
271:       rowOffset= 1;
272:       for (i=1; i<n; i++) {
273:         nz = ai[i+1] - ai[i];
274:         /* additional 1 for the term on the diagonal */
275:         AiLo[i]    = rowOffset;
276:         rowOffset += nz+1;

278:         PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));
279:         PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));

281:         offset      += nz;
282:         AjLo[offset] = (PetscInt) i;
283:         AALo[offset] = (MatScalar) 1.0;
284:         offset      += 1;

286:         v  += nz;
287:         vi += nz;
288:       }

290:       /* allocate space for the triangular factor information */
291:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

293:       /* Create the matrix description */
294:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
295:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
296:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
297:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
298:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

300:       /* Create the solve analysis information */
301:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

303:       /* set the operation */
304:       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

306:       /* set the matrix */
307:       loTriFactor->csrMat = new CsrMatrix;
308:       loTriFactor->csrMat->num_rows = n;
309:       loTriFactor->csrMat->num_cols = n;
310:       loTriFactor->csrMat->num_entries = nzLower;

312:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
313:       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

315:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
316:       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

318:       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
319:       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

321:       /* perform the solve analysis */
322:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
323:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
324:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
325:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

327:       /* assign the pointer. Is this really necessary? */
328:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

330:       cudaFreeHost(AiLo);CHKERRCUDA(ierr);
331:       cudaFreeHost(AjLo);CHKERRCUDA(ierr);
332:       cudaFreeHost(AALo);CHKERRCUDA(ierr);
333:     } catch(char *ex) {
334:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
335:     }
336:   }
337:   return(0);
338: }

340: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
341: {
342:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
343:   PetscInt                          n = A->rmap->n;
344:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
345:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
346:   cusparseStatus_t                  stat;
347:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
348:   const MatScalar                   *aa = a->a,*v;
349:   PetscInt                          *AiUp, *AjUp;
350:   PetscScalar                       *AAUp;
351:   PetscInt                          i,nz, nzUpper, offset;
352:   PetscErrorCode                    ierr;

355:   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
356:     try {
357:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
358:       nzUpper = adiag[0]-adiag[n];

360:       /* Allocate Space for the upper triangular matrix */
361:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
362:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
363:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

365:       /* Fill the upper triangular matrix */
366:       AiUp[0]=(PetscInt) 0;
367:       AiUp[n]=nzUpper;
368:       offset = nzUpper;
369:       for (i=n-1; i>=0; i--) {
370:         v  = aa + adiag[i+1] + 1;
371:         vi = aj + adiag[i+1] + 1;

373:         /* number of elements NOT on the diagonal */
374:         nz = adiag[i] - adiag[i+1]-1;

376:         /* decrement the offset */
377:         offset -= (nz+1);

379:         /* first, set the diagonal elements */
380:         AjUp[offset] = (PetscInt) i;
381:         AAUp[offset] = (MatScalar)1./v[nz];
382:         AiUp[i]      = AiUp[i+1] - (nz+1);

384:         PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));
385:         PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));
386:       }

388:       /* allocate space for the triangular factor information */
389:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

391:       /* Create the matrix description */
392:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
393:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
394:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
395:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
396:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

398:       /* Create the solve analysis information */
399:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

401:       /* set the operation */
402:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

404:       /* set the matrix */
405:       upTriFactor->csrMat = new CsrMatrix;
406:       upTriFactor->csrMat->num_rows = n;
407:       upTriFactor->csrMat->num_cols = n;
408:       upTriFactor->csrMat->num_entries = nzUpper;

410:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
411:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

413:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
414:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

416:       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
417:       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

419:       /* perform the solve analysis */
420:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
421:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
422:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
423:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

425:       /* assign the pointer. Is this really necessary? */
426:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

428:       cudaFreeHost(AiUp);CHKERRCUDA(ierr);
429:       cudaFreeHost(AjUp);CHKERRCUDA(ierr);
430:       cudaFreeHost(AAUp);CHKERRCUDA(ierr);
431:     } catch(char *ex) {
432:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
433:     }
434:   }
435:   return(0);
436: }

438: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
439: {
440:   PetscErrorCode               ierr;
441:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
442:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
443:   IS                           isrow = a->row,iscol = a->icol;
444:   PetscBool                    row_identity,col_identity;
445:   const PetscInt               *r,*c;
446:   PetscInt                     n = A->rmap->n;

449:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
450:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

452:   cusparseTriFactors->workVector = new THRUSTARRAY;
453:   cusparseTriFactors->workVector->resize(n);
454:   cusparseTriFactors->nnz=a->nz;

456:   A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
457:   /*lower triangular indices */
458:   ISGetIndices(isrow,&r);
459:   ISIdentity(isrow,&row_identity);
460:   if (!row_identity) {
461:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
462:     cusparseTriFactors->rpermIndices->assign(r, r+n);
463:   }
464:   ISRestoreIndices(isrow,&r);

466:   /*upper triangular indices */
467:   ISGetIndices(iscol,&c);
468:   ISIdentity(iscol,&col_identity);
469:   if (!col_identity) {
470:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
471:     cusparseTriFactors->cpermIndices->assign(c, c+n);
472:   }
473:   ISRestoreIndices(iscol,&c);
474:   return(0);
475: }

477: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
478: {
479:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
480:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
481:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
482:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
483:   cusparseStatus_t                  stat;
484:   PetscErrorCode                    ierr;
485:   PetscInt                          *AiUp, *AjUp;
486:   PetscScalar                       *AAUp;
487:   PetscScalar                       *AALo;
488:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
489:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
490:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
491:   const MatScalar                   *aa = b->a,*v;

494:   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
495:     try {
496:       /* Allocate Space for the upper triangular matrix */
497:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
498:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
499:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
500:       cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

502:       /* Fill the upper triangular matrix */
503:       AiUp[0]=(PetscInt) 0;
504:       AiUp[n]=nzUpper;
505:       offset = 0;
506:       for (i=0; i<n; i++) {
507:         /* set the pointers */
508:         v  = aa + ai[i];
509:         vj = aj + ai[i];
510:         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

512:         /* first, set the diagonal elements */
513:         AjUp[offset] = (PetscInt) i;
514:         AAUp[offset] = (MatScalar)1.0/v[nz];
515:         AiUp[i]      = offset;
516:         AALo[offset] = (MatScalar)1.0/v[nz];

518:         offset+=1;
519:         if (nz>0) {
520:           PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));
521:           PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));
522:           for (j=offset; j<offset+nz; j++) {
523:             AAUp[j] = -AAUp[j];
524:             AALo[j] = AAUp[j]/v[nz];
525:           }
526:           offset+=nz;
527:         }
528:       }

530:       /* allocate space for the triangular factor information */
531:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

533:       /* Create the matrix description */
534:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
535:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
536:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
537:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
538:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

540:       /* Create the solve analysis information */
541:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

543:       /* set the operation */
544:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

546:       /* set the matrix */
547:       upTriFactor->csrMat = new CsrMatrix;
548:       upTriFactor->csrMat->num_rows = A->rmap->n;
549:       upTriFactor->csrMat->num_cols = A->cmap->n;
550:       upTriFactor->csrMat->num_entries = a->nz;

552:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
553:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

555:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
556:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

558:       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
559:       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

561:       /* perform the solve analysis */
562:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
563:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
564:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
565:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

567:       /* assign the pointer. Is this really necessary? */
568:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

570:       /* allocate space for the triangular factor information */
571:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

573:       /* Create the matrix description */
574:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
575:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
576:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
577:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
578:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

580:       /* Create the solve analysis information */
581:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

583:       /* set the operation */
584:       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

586:       /* set the matrix */
587:       loTriFactor->csrMat = new CsrMatrix;
588:       loTriFactor->csrMat->num_rows = A->rmap->n;
589:       loTriFactor->csrMat->num_cols = A->cmap->n;
590:       loTriFactor->csrMat->num_entries = a->nz;

592:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
593:       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

595:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
596:       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

598:       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
599:       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

601:       /* perform the solve analysis */
602:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
603:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
604:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
605:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

607:       /* assign the pointer. Is this really necessary? */
608:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

610:       A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
611:       cudaFreeHost(AiUp);CHKERRCUDA(ierr);
612:       cudaFreeHost(AjUp);CHKERRCUDA(ierr);
613:       cudaFreeHost(AAUp);CHKERRCUDA(ierr);
614:       cudaFreeHost(AALo);CHKERRCUDA(ierr);
615:     } catch(char *ex) {
616:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
617:     }
618:   }
619:   return(0);
620: }

622: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
623: {
624:   PetscErrorCode               ierr;
625:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
626:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
627:   IS                           ip = a->row;
628:   const PetscInt               *rip;
629:   PetscBool                    perm_identity;
630:   PetscInt                     n = A->rmap->n;

633:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
634:   cusparseTriFactors->workVector = new THRUSTARRAY;
635:   cusparseTriFactors->workVector->resize(n);
636:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;

638:   /*lower triangular indices */
639:   ISGetIndices(ip,&rip);
640:   ISIdentity(ip,&perm_identity);
641:   if (!perm_identity) {
642:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
643:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
644:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
645:     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
646:   }
647:   ISRestoreIndices(ip,&rip);
648:   return(0);
649: }

651: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
652: {
653:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
654:   IS             isrow = b->row,iscol = b->col;
655:   PetscBool      row_identity,col_identity;

659:   MatLUFactorNumeric_SeqAIJ(B,A,info);
660:   /* determine which version of MatSolve needs to be used. */
661:   ISIdentity(isrow,&row_identity);
662:   ISIdentity(iscol,&col_identity);
663:   if (row_identity && col_identity) {
664:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
665:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
666:   } else {
667:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
668:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
669:   }

671:   /* get the triangular factors */
672:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
673:   return(0);
674: }

676: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
677: {
678:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
679:   IS             ip = b->row;
680:   PetscBool      perm_identity;

684:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);

686:   /* determine which version of MatSolve needs to be used. */
687:   ISIdentity(ip,&perm_identity);
688:   if (perm_identity) {
689:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
690:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
691:   } else {
692:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
693:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
694:   }

696:   /* get the triangular factors */
697:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
698:   return(0);
699: }

701: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
702: {
703:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
704:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
705:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
706:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
707:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
708:   cusparseStatus_t                  stat;
709:   cusparseIndexBase_t               indexBase;
710:   cusparseMatrixType_t              matrixType;
711:   cusparseFillMode_t                fillMode;
712:   cusparseDiagType_t                diagType;


716:   /*********************************************/
717:   /* Now the Transpose of the Lower Tri Factor */
718:   /*********************************************/

720:   /* allocate space for the transpose of the lower triangular factor */
721:   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

723:   /* set the matrix descriptors of the lower triangular factor */
724:   matrixType = cusparseGetMatType(loTriFactor->descr);
725:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
726:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
727:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
728:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

730:   /* Create the matrix description */
731:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
732:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
733:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
734:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
735:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

737:   /* Create the solve analysis information */
738:   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

740:   /* set the operation */
741:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

743:   /* allocate GPU space for the CSC of the lower triangular factor*/
744:   loTriFactorT->csrMat = new CsrMatrix;
745:   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
746:   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
747:   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
748:   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
749:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
750:   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

752:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
753:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
754:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
755:                           loTriFactor->csrMat->values->data().get(),
756:                           loTriFactor->csrMat->row_offsets->data().get(),
757:                           loTriFactor->csrMat->column_indices->data().get(),
758:                           loTriFactorT->csrMat->values->data().get(),
759:                           loTriFactorT->csrMat->column_indices->data().get(),
760:                           loTriFactorT->csrMat->row_offsets->data().get(),
761:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

763:   /* perform the solve analysis on the transposed matrix */
764:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
765:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
766:                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
767:                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
768:                            loTriFactorT->solveInfo);CHKERRCUDA(stat);

770:   /* assign the pointer. Is this really necessary? */
771:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

773:   /*********************************************/
774:   /* Now the Transpose of the Upper Tri Factor */
775:   /*********************************************/

777:   /* allocate space for the transpose of the upper triangular factor */
778:   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

780:   /* set the matrix descriptors of the upper triangular factor */
781:   matrixType = cusparseGetMatType(upTriFactor->descr);
782:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
783:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
784:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
785:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

787:   /* Create the matrix description */
788:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
789:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
790:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
791:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
792:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

794:   /* Create the solve analysis information */
795:   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

797:   /* set the operation */
798:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

800:   /* allocate GPU space for the CSC of the upper triangular factor*/
801:   upTriFactorT->csrMat = new CsrMatrix;
802:   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
803:   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
804:   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
805:   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
806:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
807:   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

809:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
810:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
811:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
812:                           upTriFactor->csrMat->values->data().get(),
813:                           upTriFactor->csrMat->row_offsets->data().get(),
814:                           upTriFactor->csrMat->column_indices->data().get(),
815:                           upTriFactorT->csrMat->values->data().get(),
816:                           upTriFactorT->csrMat->column_indices->data().get(),
817:                           upTriFactorT->csrMat->row_offsets->data().get(),
818:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

820:   /* perform the solve analysis on the transposed matrix */
821:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
822:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
823:                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
824:                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
825:                            upTriFactorT->solveInfo);CHKERRCUDA(stat);

827:   /* assign the pointer. Is this really necessary? */
828:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
829:   return(0);
830: }

832: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
833: {
834:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
835:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
836:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
837:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
838:   cusparseStatus_t             stat;
839:   cusparseIndexBase_t          indexBase;
840:   cudaError_t                  err;


844:   /* allocate space for the triangular factor information */
845:   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
846:   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
847:   indexBase = cusparseGetMatIndexBase(matstruct->descr);
848:   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
849:   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

851:   /* set alpha and beta */
852:   err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
853:   err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
854:   err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
855:   err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
856:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

858:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
859:     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
860:     CsrMatrix *matrixT= new CsrMatrix;
861:     matrixT->num_rows = A->cmap->n;
862:     matrixT->num_cols = A->rmap->n;
863:     matrixT->num_entries = a->nz;
864:     matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
865:     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
866:     matrixT->values = new THRUSTARRAY(a->nz);

868:     /* compute the transpose of the upper triangular factor, i.e. the CSC */
869:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
870:     stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
871:                             matrix->num_cols, matrix->num_entries,
872:                             matrix->values->data().get(),
873:                             matrix->row_offsets->data().get(),
874:                             matrix->column_indices->data().get(),
875:                             matrixT->values->data().get(),
876:                             matrixT->column_indices->data().get(),
877:                             matrixT->row_offsets->data().get(),
878:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

880:     /* assign the pointer */
881:     matstructT->mat = matrixT;

883:   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
884: #if CUDA_VERSION>=5000
885:     /* First convert HYB to CSR */
886:     CsrMatrix *temp= new CsrMatrix;
887:     temp->num_rows = A->rmap->n;
888:     temp->num_cols = A->cmap->n;
889:     temp->num_entries = a->nz;
890:     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
891:     temp->column_indices = new THRUSTINTARRAY32(a->nz);
892:     temp->values = new THRUSTARRAY(a->nz);


895:     stat = cusparse_hyb2csr(cusparsestruct->handle,
896:                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
897:                             temp->values->data().get(),
898:                             temp->row_offsets->data().get(),
899:                             temp->column_indices->data().get());CHKERRCUDA(stat);

901:     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
902:     CsrMatrix *tempT= new CsrMatrix;
903:     tempT->num_rows = A->rmap->n;
904:     tempT->num_cols = A->cmap->n;
905:     tempT->num_entries = a->nz;
906:     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
907:     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
908:     tempT->values = new THRUSTARRAY(a->nz);

910:     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
911:                             temp->num_cols, temp->num_entries,
912:                             temp->values->data().get(),
913:                             temp->row_offsets->data().get(),
914:                             temp->column_indices->data().get(),
915:                             tempT->values->data().get(),
916:                             tempT->column_indices->data().get(),
917:                             tempT->row_offsets->data().get(),
918:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

920:     /* Last, convert CSC to HYB */
921:     cusparseHybMat_t hybMat;
922:     stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
923:     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
924:       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
925:     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
926:                             matstructT->descr, tempT->values->data().get(),
927:                             tempT->row_offsets->data().get(),
928:                             tempT->column_indices->data().get(),
929:                             hybMat, 0, partition);CHKERRCUDA(stat);

931:     /* assign the pointer */
932:     matstructT->mat = hybMat;

934:     /* delete temporaries */
935:     if (tempT) {
936:       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
937:       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
938:       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
939:       delete (CsrMatrix*) tempT;
940:     }
941:     if (temp) {
942:       if (temp->values) delete (THRUSTARRAY*) temp->values;
943:       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
944:       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
945:       delete (CsrMatrix*) temp;
946:     }
947: #else
948:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
949: #endif
950:   }
951:   /* assign the compressed row indices */
952:   matstructT->cprowIndices = new THRUSTINTARRAY;
953:   matstructT->cprowIndices->resize(A->cmap->n);
954:   thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());

956:   /* assign the pointer */
957:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
958:   return(0);
959: }

961: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
962: {
963:   PetscInt                              n = xx->map->n;
964:   const PetscScalar                     *barray;
965:   PetscScalar                           *xarray;
966:   thrust::device_ptr<const PetscScalar> bGPU;
967:   thrust::device_ptr<PetscScalar>       xGPU;
968:   cusparseStatus_t                      stat;
969:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
970:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
971:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
972:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
973:   PetscErrorCode                        ierr;

976:   /* Analyze the matrix and create the transpose ... on the fly */
977:   if (!loTriFactorT && !upTriFactorT) {
978:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
979:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
980:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
981:   }

983:   /* Get the GPU pointers */
984:   VecCUDAGetArrayWrite(xx,&xarray);
985:   VecCUDAGetArrayRead(bb,&barray);
986:   xGPU = thrust::device_pointer_cast(xarray);
987:   bGPU = thrust::device_pointer_cast(barray);

989:   /* First, reorder with the row permutation */
990:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
991:                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
992:                xGPU);

994:   /* First, solve U */
995:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
996:                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
997:                         upTriFactorT->csrMat->values->data().get(),
998:                         upTriFactorT->csrMat->row_offsets->data().get(),
999:                         upTriFactorT->csrMat->column_indices->data().get(),
1000:                         upTriFactorT->solveInfo,
1001:                         xarray, tempGPU->data().get());CHKERRCUDA(stat);

1003:   /* Then, solve L */
1004:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1005:                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1006:                         loTriFactorT->csrMat->values->data().get(),
1007:                         loTriFactorT->csrMat->row_offsets->data().get(),
1008:                         loTriFactorT->csrMat->column_indices->data().get(),
1009:                         loTriFactorT->solveInfo,
1010:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1012:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1013:   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1014:                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1015:                tempGPU->begin());

1017:   /* Copy the temporary to the full solution. */
1018:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

1020:   /* restore */
1021:   VecCUDARestoreArrayRead(bb,&barray);
1022:   VecCUDARestoreArrayWrite(xx,&xarray);
1023:   WaitForGPU();CHKERRCUDA(ierr);

1025:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1026:   return(0);
1027: }

1029: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1030: {
1031:   const PetscScalar                 *barray;
1032:   PetscScalar                       *xarray;
1033:   cusparseStatus_t                  stat;
1034:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1035:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1036:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1037:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1038:   PetscErrorCode                    ierr;

1041:   /* Analyze the matrix and create the transpose ... on the fly */
1042:   if (!loTriFactorT && !upTriFactorT) {
1043:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1044:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1045:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1046:   }

1048:   /* Get the GPU pointers */
1049:   VecCUDAGetArrayWrite(xx,&xarray);
1050:   VecCUDAGetArrayRead(bb,&barray);

1052:   /* First, solve U */
1053:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1054:                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1055:                         upTriFactorT->csrMat->values->data().get(),
1056:                         upTriFactorT->csrMat->row_offsets->data().get(),
1057:                         upTriFactorT->csrMat->column_indices->data().get(),
1058:                         upTriFactorT->solveInfo,
1059:                         barray, tempGPU->data().get());CHKERRCUDA(stat);

1061:   /* Then, solve L */
1062:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1063:                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1064:                         loTriFactorT->csrMat->values->data().get(),
1065:                         loTriFactorT->csrMat->row_offsets->data().get(),
1066:                         loTriFactorT->csrMat->column_indices->data().get(),
1067:                         loTriFactorT->solveInfo,
1068:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1070:   /* restore */
1071:   VecCUDARestoreArrayRead(bb,&barray);
1072:   VecCUDARestoreArrayWrite(xx,&xarray);
1073:   WaitForGPU();CHKERRCUDA(ierr);
1074:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1075:   return(0);
1076: }

1078: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1079: {
1080:   const PetscScalar                     *barray;
1081:   PetscScalar                           *xarray;
1082:   thrust::device_ptr<const PetscScalar> bGPU;
1083:   thrust::device_ptr<PetscScalar>       xGPU;
1084:   cusparseStatus_t                      stat;
1085:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1086:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1087:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1088:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1089:   PetscErrorCode                        ierr;


1093:   /* Get the GPU pointers */
1094:   VecCUDAGetArrayWrite(xx,&xarray);
1095:   VecCUDAGetArrayRead(bb,&barray);
1096:   xGPU = thrust::device_pointer_cast(xarray);
1097:   bGPU = thrust::device_pointer_cast(barray);

1099:   /* First, reorder with the row permutation */
1100:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1101:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1102:                xGPU);

1104:   /* Next, solve L */
1105:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1106:                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1107:                         loTriFactor->csrMat->values->data().get(),
1108:                         loTriFactor->csrMat->row_offsets->data().get(),
1109:                         loTriFactor->csrMat->column_indices->data().get(),
1110:                         loTriFactor->solveInfo,
1111:                         xarray, tempGPU->data().get());CHKERRCUDA(stat);

1113:   /* Then, solve U */
1114:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1115:                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1116:                         upTriFactor->csrMat->values->data().get(),
1117:                         upTriFactor->csrMat->row_offsets->data().get(),
1118:                         upTriFactor->csrMat->column_indices->data().get(),
1119:                         upTriFactor->solveInfo,
1120:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1122:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1123:   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1124:                thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
1125:                tempGPU->begin());

1127:   /* Copy the temporary to the full solution. */
1128:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

1130:   VecCUDARestoreArrayRead(bb,&barray);
1131:   VecCUDARestoreArrayWrite(xx,&xarray);
1132:   WaitForGPU();CHKERRCUDA(ierr);
1133:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1134:   return(0);
1135: }

1137: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1138: {
1139:   const PetscScalar                 *barray;
1140:   PetscScalar                       *xarray;
1141:   cusparseStatus_t                  stat;
1142:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1143:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1144:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1145:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1146:   PetscErrorCode                    ierr;

1149:   /* Get the GPU pointers */
1150:   VecCUDAGetArrayWrite(xx,&xarray);
1151:   VecCUDAGetArrayRead(bb,&barray);

1153:   /* First, solve L */
1154:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1155:                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1156:                         loTriFactor->csrMat->values->data().get(),
1157:                         loTriFactor->csrMat->row_offsets->data().get(),
1158:                         loTriFactor->csrMat->column_indices->data().get(),
1159:                         loTriFactor->solveInfo,
1160:                         barray, tempGPU->data().get());CHKERRCUDA(stat);

1162:   /* Next, solve U */
1163:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1164:                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1165:                         upTriFactor->csrMat->values->data().get(),
1166:                         upTriFactor->csrMat->row_offsets->data().get(),
1167:                         upTriFactor->csrMat->column_indices->data().get(),
1168:                         upTriFactor->solveInfo,
1169:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1171:   VecCUDARestoreArrayRead(bb,&barray);
1172:   VecCUDARestoreArrayWrite(xx,&xarray);
1173:   WaitForGPU();CHKERRCUDA(ierr);
1174:   PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1175:   return(0);
1176: }

1178: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1179: {

1181:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1182:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1183:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1184:   PetscInt                     m = A->rmap->n,*ii,*ridx;
1185:   PetscErrorCode               ierr;
1186:   cusparseStatus_t             stat;
1187:   cudaError_t                  err;

1190:   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1191:     PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1192:     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1193:       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1194:       /* copy values only */
1195:       matrix->values->assign(a->a, a->a+a->nz);
1196:     } else {
1197:       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1198:       try {
1199:         cusparsestruct->nonzerorow=0;
1200:         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

1202:         if (a->compressedrow.use) {
1203:           m    = a->compressedrow.nrows;
1204:           ii   = a->compressedrow.i;
1205:           ridx = a->compressedrow.rindex;
1206:         } else {
1207:           /* Forcing compressed row on the GPU */
1208:           int k=0;
1209:           PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1210:           PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1211:           ii[0]=0;
1212:           for (int j = 0; j<m; j++) {
1213:             if ((a->i[j+1]-a->i[j])>0) {
1214:               ii[k]  = a->i[j];
1215:               ridx[k]= j;
1216:               k++;
1217:             }
1218:           }
1219:           ii[cusparsestruct->nonzerorow] = a->nz;
1220:           m = cusparsestruct->nonzerorow;
1221:         }

1223:         /* allocate space for the triangular factor information */
1224:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1225:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1226:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1227:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

1229:         err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
1230:         err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1231:         err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
1232:         err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1233:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

1235:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1236:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1237:           /* set the matrix */
1238:           CsrMatrix *matrix= new CsrMatrix;
1239:           matrix->num_rows = m;
1240:           matrix->num_cols = A->cmap->n;
1241:           matrix->num_entries = a->nz;
1242:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1243:           matrix->row_offsets->assign(ii, ii + m+1);

1245:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1246:           matrix->column_indices->assign(a->j, a->j+a->nz);

1248:           matrix->values = new THRUSTARRAY(a->nz);
1249:           matrix->values->assign(a->a, a->a+a->nz);

1251:           /* assign the pointer */
1252:           matstruct->mat = matrix;

1254:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1255: #if CUDA_VERSION>=4020
1256:           CsrMatrix *matrix= new CsrMatrix;
1257:           matrix->num_rows = m;
1258:           matrix->num_cols = A->cmap->n;
1259:           matrix->num_entries = a->nz;
1260:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1261:           matrix->row_offsets->assign(ii, ii + m+1);

1263:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1264:           matrix->column_indices->assign(a->j, a->j+a->nz);

1266:           matrix->values = new THRUSTARRAY(a->nz);
1267:           matrix->values->assign(a->a, a->a+a->nz);

1269:           cusparseHybMat_t hybMat;
1270:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1271:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1272:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1273:           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1274:               matstruct->descr, matrix->values->data().get(),
1275:               matrix->row_offsets->data().get(),
1276:               matrix->column_indices->data().get(),
1277:               hybMat, 0, partition);CHKERRCUDA(stat);
1278:           /* assign the pointer */
1279:           matstruct->mat = hybMat;

1281:           if (matrix) {
1282:             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1283:             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1284:             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1285:             delete (CsrMatrix*)matrix;
1286:           }
1287: #endif
1288:         }

1290:         /* assign the compressed row indices */
1291:         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1292:         matstruct->cprowIndices->assign(ridx,ridx+m);

1294:         /* assign the pointer */
1295:         cusparsestruct->mat = matstruct;

1297:         if (!a->compressedrow.use) {
1298:           PetscFree(ii);
1299:           PetscFree(ridx);
1300:         }
1301:         cusparsestruct->workVector = new THRUSTARRAY;
1302:         cusparsestruct->workVector->resize(m);
1303:       } catch(char *ex) {
1304:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1305:       }
1306:       cusparsestruct->nonzerostate = A->nonzerostate;
1307:     }
1308:     WaitForGPU();CHKERRCUDA(ierr);
1309:     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1310:     PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1311:   }
1312:   return(0);
1313: }

1315: static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
1316: {
1318:   PetscInt rbs,cbs;

1321:   MatGetBlockSizes(mat,&rbs,&cbs);
1322:   if (right) {
1323:     VecCreate(PetscObjectComm((PetscObject)mat),right);
1324:     VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
1325:     VecSetBlockSize(*right,cbs);
1326:     VecSetType(*right,VECSEQCUDA);
1327:     PetscLayoutReference(mat->cmap,&(*right)->map);
1328:   }
1329:   if (left) {
1330:     VecCreate(PetscObjectComm((PetscObject)mat),left);
1331:     VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
1332:     VecSetBlockSize(*left,rbs);
1333:     VecSetType(*left,VECSEQCUDA);
1334:     PetscLayoutReference(mat->rmap,&(*left)->map);
1335:   }
1336:   return(0);
1337: }

1339: struct VecCUDAPlusEquals
1340: {
1341:   template <typename Tuple>
1342:   __host__ __device__
1343:   void operator()(Tuple t)
1344:   {
1345:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1346:   }
1347: };

1349: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1350: {
1351:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1352:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1353:   Mat_SeqAIJCUSPARSEMultStruct *matstruct; /* Do not initialize yet, because GPU data may not be available yet */
1354:   const PetscScalar            *xarray;
1355:   PetscScalar                  *yarray;
1356:   PetscErrorCode               ierr;
1357:   cusparseStatus_t             stat;

1360:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1361:   MatSeqAIJCUSPARSECopyToGPU(A);
1362:   VecCUDAGetArrayRead(xx,&xarray);
1363:   VecSet(yy,0);
1364:   VecCUDAGetArrayWrite(yy,&yarray);
1365:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1366:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1367:     CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1368:     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1369:                              mat->num_rows, mat->num_cols, mat->num_entries,
1370:                              matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
1371:                              mat->column_indices->data().get(), xarray, matstruct->beta,
1372:                              yarray);CHKERRCUDA(stat);
1373:   } else {
1374: #if CUDA_VERSION>=4020
1375:     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1376:     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1377:                              matstruct->alpha, matstruct->descr, hybMat,
1378:                              xarray, matstruct->beta,
1379:                              yarray);CHKERRCUDA(stat);
1380: #endif
1381:   }
1382:   VecCUDARestoreArrayRead(xx,&xarray);
1383:   VecCUDARestoreArrayWrite(yy,&yarray);
1384:   if (!cusparsestruct->stream) {
1385:     WaitForGPU();CHKERRCUDA(ierr);
1386:   }
1387:   PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1388:   return(0);
1389: }

1391: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1392: {
1393:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1394:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1395:   Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1396:   const PetscScalar            *xarray;
1397:   PetscScalar                  *yarray;
1398:   PetscErrorCode               ierr;
1399:   cusparseStatus_t             stat;

1402:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1403:   MatSeqAIJCUSPARSECopyToGPU(A);
1404:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1405:   if (!matstructT) {
1406:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1407:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1408:   }
1409:   VecCUDAGetArrayRead(xx,&xarray);
1410:   VecSet(yy,0);
1411:   VecCUDAGetArrayWrite(yy,&yarray);

1413:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1414:     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1415:     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1416:                              mat->num_rows, mat->num_cols,
1417:                              mat->num_entries, matstructT->alpha, matstructT->descr,
1418:                              mat->values->data().get(), mat->row_offsets->data().get(),
1419:                              mat->column_indices->data().get(), xarray, matstructT->beta,
1420:                              yarray);CHKERRCUDA(stat);
1421:   } else {
1422: #if CUDA_VERSION>=4020
1423:     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1424:     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1425:                              matstructT->alpha, matstructT->descr, hybMat,
1426:                              xarray, matstructT->beta,
1427:                              yarray);CHKERRCUDA(stat);
1428: #endif
1429:   }
1430:   VecCUDARestoreArrayRead(xx,&xarray);
1431:   VecCUDARestoreArrayWrite(yy,&yarray);
1432:   if (!cusparsestruct->stream) {
1433:     WaitForGPU();CHKERRCUDA(ierr);
1434:   }
1435:   PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1436:   return(0);
1437: }


1440: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1441: {
1442:   Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
1443:   Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1444:   Mat_SeqAIJCUSPARSEMultStruct    *matstruct;
1445:   thrust::device_ptr<PetscScalar> zptr;
1446:   const PetscScalar               *xarray;
1447:   PetscScalar                     *zarray;
1448:   PetscErrorCode                  ierr;
1449:   cusparseStatus_t                stat;

1452:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1453:   MatSeqAIJCUSPARSECopyToGPU(A);
1454:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1455:   try {
1456:     VecCopy_SeqCUDA(yy,zz);
1457:     VecCUDAGetArrayRead(xx,&xarray);
1458:     VecCUDAGetArrayReadWrite(zz,&zarray);
1459:     zptr = thrust::device_pointer_cast(zarray);

1461:     /* multiply add */
1462:     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1463:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1464:     /* here we need to be careful to set the number of rows in the multiply to the
1465:        number of compressed rows in the matrix ... which is equivalent to the
1466:        size of the workVector */
1467:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1468:                                mat->num_rows, mat->num_cols,
1469:                                mat->num_entries, matstruct->alpha, matstruct->descr,
1470:                                mat->values->data().get(), mat->row_offsets->data().get(),
1471:                                mat->column_indices->data().get(), xarray, matstruct->beta,
1472:                                cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1473:     } else {
1474: #if CUDA_VERSION>=4020
1475:       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1476:       if (cusparsestruct->workVector->size()) {
1477:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1478:             matstruct->alpha, matstruct->descr, hybMat,
1479:             xarray, matstruct->beta,
1480:             cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1481:       }
1482: #endif
1483:     }

1485:     /* scatter the data from the temporary into the full vector with a += operation */
1486:     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1487:         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1488:         VecCUDAPlusEquals());
1489:     VecCUDARestoreArrayRead(xx,&xarray);
1490:     VecCUDARestoreArrayReadWrite(zz,&zarray);

1492:   } catch(char *ex) {
1493:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1494:   }
1495:   WaitForGPU();CHKERRCUDA(ierr);
1496:   PetscLogFlops(2.0*a->nz);
1497:   return(0);
1498: }

1500: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1501: {
1502:   Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
1503:   Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1504:   Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
1505:   thrust::device_ptr<PetscScalar> zptr;
1506:   const PetscScalar               *xarray;
1507:   PetscScalar                     *zarray;
1508:   PetscErrorCode                  ierr;
1509:   cusparseStatus_t                stat;

1512:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1513:   MatSeqAIJCUSPARSECopyToGPU(A);
1514:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1515:   if (!matstructT) {
1516:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1517:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1518:   }

1520:   try {
1521:     VecCopy_SeqCUDA(yy,zz);
1522:     VecCUDAGetArrayRead(xx,&xarray);
1523:     VecCUDAGetArrayReadWrite(zz,&zarray);
1524:     zptr = thrust::device_pointer_cast(zarray);

1526:     /* multiply add with matrix transpose */
1527:     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1528:       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1529:       /* here we need to be careful to set the number of rows in the multiply to the
1530:          number of compressed rows in the matrix ... which is equivalent to the
1531:          size of the workVector */
1532:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1533:                                mat->num_rows, mat->num_cols,
1534:                                mat->num_entries, matstructT->alpha, matstructT->descr,
1535:                                mat->values->data().get(), mat->row_offsets->data().get(),
1536:                                mat->column_indices->data().get(), xarray, matstructT->beta,
1537:                                cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1538:     } else {
1539: #if CUDA_VERSION>=4020
1540:       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1541:       if (cusparsestruct->workVector->size()) {
1542:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1543:             matstructT->alpha, matstructT->descr, hybMat,
1544:             xarray, matstructT->beta,
1545:             cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1546:       }
1547: #endif
1548:     }

1550:     /* scatter the data from the temporary into the full vector with a += operation */
1551:     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
1552:         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
1553:         VecCUDAPlusEquals());

1555:     VecCUDARestoreArrayRead(xx,&xarray);
1556:     VecCUDARestoreArrayReadWrite(zz,&zarray);

1558:   } catch(char *ex) {
1559:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1560:   }
1561:   WaitForGPU();CHKERRCUDA(ierr);
1562:   PetscLogFlops(2.0*a->nz);
1563:   return(0);
1564: }

1566: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1567: {

1571:   MatAssemblyEnd_SeqAIJ(A,mode);
1572:   if (A->factortype==MAT_FACTOR_NONE) {
1573:     MatSeqAIJCUSPARSECopyToGPU(A);
1574:   }
1575:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1576:   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1577:   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1578:   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1579:   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1580:   return(0);
1581: }

1583: /* --------------------------------------------------------------------------------*/
1584: /*@
1585:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1586:    (the default parallel PETSc format). This matrix will ultimately pushed down
1587:    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1588:    assembly performance the user should preallocate the matrix storage by setting
1589:    the parameter nz (or the array nnz).  By setting these parameters accurately,
1590:    performance during matrix assembly can be increased by more than a factor of 50.

1592:    Collective on MPI_Comm

1594:    Input Parameters:
1595: +  comm - MPI communicator, set to PETSC_COMM_SELF
1596: .  m - number of rows
1597: .  n - number of columns
1598: .  nz - number of nonzeros per row (same for all rows)
1599: -  nnz - array containing the number of nonzeros in the various rows
1600:          (possibly different for each row) or NULL

1602:    Output Parameter:
1603: .  A - the matrix

1605:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1606:    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1607:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

1609:    Notes:
1610:    If nnz is given then nz is ignored

1612:    The AIJ format (also called the Yale sparse matrix format or
1613:    compressed row storage), is fully compatible with standard Fortran 77
1614:    storage.  That is, the stored row and column indices can begin at
1615:    either one (as in Fortran) or zero.  See the users' manual for details.

1617:    Specify the preallocated storage with either nz or nnz (not both).
1618:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1619:    allocation.  For large problems you MUST preallocate memory or you
1620:    will get TERRIBLE performance, see the users' manual chapter on matrices.

1622:    By default, this format uses inodes (identical nodes) when possible, to
1623:    improve numerical efficiency of matrix-vector products and solves. We
1624:    search for consecutive rows with the same nonzero structure, thereby
1625:    reusing matrix information to achieve increased efficiency.

1627:    Level: intermediate

1629: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1630: @*/
1631: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1632: {

1636:   MatCreate(comm,A);
1637:   MatSetSizes(*A,m,n,m,n);
1638:   MatSetType(*A,MATSEQAIJCUSPARSE);
1639:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1640:   return(0);
1641: }

1643: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1644: {
1645:   PetscErrorCode   ierr;

1648:   if (A->factortype==MAT_FACTOR_NONE) {
1649:     if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
1650:       MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1651:     }
1652:   } else {
1653:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1654:   }
1655:   MatDestroy_SeqAIJ(A);
1656:   return(0);
1657: }

1659: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1660: {
1662:   Mat C;
1663:   cusparseStatus_t stat;
1664:   cusparseHandle_t handle=0;

1667:   MatDuplicate_SeqAIJ(A,cpvalues,B);
1668:   C = *B;
1669:   /* inject CUSPARSE-specific stuff */
1670:   if (C->factortype==MAT_FACTOR_NONE) {
1671:     /* you cannot check the inode.use flag here since the matrix was just created.
1672:        now build a GPU matrix data structure */
1673:     C->spptr = new Mat_SeqAIJCUSPARSE;
1674:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
1675:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
1676:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
1677:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
1678:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
1679:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = 0;
1680:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1681:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
1682:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
1683:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
1684:   } else {
1685:     /* NEXT, set the pointers to the triangular factors */
1686:     C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1687:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
1688:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
1689:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1690:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1691:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
1692:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
1693:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
1694:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
1695:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1696:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
1697:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
1698:   }

1700:   C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1701:   C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1702:   C->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
1703:   C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1704:   C->ops->mult             = MatMult_SeqAIJCUSPARSE;
1705:   C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1706:   C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1707:   C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1708:   C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

1710:   PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);

1712:   C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

1714:   PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1715:   return(0);
1716: }

1718: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1719: {
1721:   cusparseStatus_t stat;
1722:   cusparseHandle_t handle=0;

1725:   MatCreate_SeqAIJ(B);
1726:   if (B->factortype==MAT_FACTOR_NONE) {
1727:     /* you cannot check the inode.use flag here since the matrix was just created.
1728:        now build a GPU matrix data structure */
1729:     B->spptr = new Mat_SeqAIJCUSPARSE;
1730:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1731:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1732:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1733:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1734:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1735:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = 0;
1736:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1737:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
1738:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1739:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
1740:   } else {
1741:     /* NEXT, set the pointers to the triangular factors */
1742:     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1743:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
1744:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1745:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1746:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1747:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1748:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1749:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1750:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = 0;
1751:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1752:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1753:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
1754:   }

1756:   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1757:   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1758:   B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
1759:   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1760:   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1761:   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1762:   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1763:   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1764:   B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

1766:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);

1768:   B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

1770:   PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1771:   return(0);
1772: }

1774: /*MC
1775:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

1777:    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1778:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1779:    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

1781:    Options Database Keys:
1782: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1783: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1784: .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

1786:   Level: beginner

1788: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1789: M*/

1791: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


1794: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1795: {

1799:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
1800:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
1801:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
1802:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
1803:   return(0);
1804: }


1807: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1808: {
1809:   cusparseStatus_t stat;
1810:   cusparseHandle_t handle;

1813:   if (*cusparsestruct) {
1814:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1815:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1816:     delete (*cusparsestruct)->workVector;
1817:     if (handle = (*cusparsestruct)->handle) {
1818:       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1819:     }
1820:     delete *cusparsestruct;
1821:     *cusparsestruct = 0;
1822:   }
1823:   return(0);
1824: }

1826: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1827: {
1829:   if (*mat) {
1830:     delete (*mat)->values;
1831:     delete (*mat)->column_indices;
1832:     delete (*mat)->row_offsets;
1833:     delete *mat;
1834:     *mat = 0;
1835:   }
1836:   return(0);
1837: }

1839: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1840: {
1841:   cusparseStatus_t stat;
1842:   PetscErrorCode   ierr;

1845:   if (*trifactor) {
1846:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1847:     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1848:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
1849:     delete *trifactor;
1850:     *trifactor = 0;
1851:   }
1852:   return(0);
1853: }

1855: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1856: {
1857:   CsrMatrix        *mat;
1858:   cusparseStatus_t stat;
1859:   cudaError_t      err;

1862:   if (*matstruct) {
1863:     if ((*matstruct)->mat) {
1864:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1865:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1866:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1867:       } else {
1868:         mat = (CsrMatrix*)(*matstruct)->mat;
1869:         CsrMatrix_Destroy(&mat);
1870:       }
1871:     }
1872:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1873:     delete (*matstruct)->cprowIndices;
1874:     if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1875:     if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
1876:     delete *matstruct;
1877:     *matstruct = 0;
1878:   }
1879:   return(0);
1880: }

1882: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1883: {
1884:   cusparseHandle_t handle;
1885:   cusparseStatus_t stat;

1888:   if (*trifactors) {
1889:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1890:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1891:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1892:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1893:     delete (*trifactors)->rpermIndices;
1894:     delete (*trifactors)->cpermIndices;
1895:     delete (*trifactors)->workVector;
1896:     if (handle = (*trifactors)->handle) {
1897:       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1898:     }
1899:     delete *trifactors;
1900:     *trifactors = 0;
1901:   }
1902:   return(0);
1903: }