Actual source code: aijcusparse.cu

petsc-3.13.6 2020-09-29
Report Typos and Errors
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX
  7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  9: #include <petscconf.h>
 10:  #include <../src/mat/impls/aij/seq/aij.h>
 11:  #include <../src/mat/impls/sbaij/seq/sbaij.h>
 12:  #include <../src/vec/vec/impls/dvecimpl.h>
 13:  #include <petsc/private/vecimpl.h>
 14: #undef VecType
 15:  #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

 17: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

 19: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 20: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 21: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 23: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 24: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 25: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 27: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 28: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 29: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 30: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 31: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 32: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 33: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 34: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 35: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

 37: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 38: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 39: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 40: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 41: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 43: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 44: {
 45:   cusparseStatus_t   stat;
 46:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 49:   cusparsestruct->stream = stream;
 50:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
 51:   return(0);
 52: }

 54: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
 55: {
 56:   cusparseStatus_t   stat;
 57:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 60:   if (cusparsestruct->handle != handle) {
 61:     if (cusparsestruct->handle) {
 62:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
 63:     }
 64:     cusparsestruct->handle = handle;
 65:   }
 66:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
 67:   return(0);
 68: }

 70: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
 71: {
 72:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
 74:   if (cusparsestruct->handle)
 75:     cusparsestruct->handle = 0;
 76:   return(0);
 77: }

 79: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
 80: {
 82:   *type = MATSOLVERCUSPARSE;
 83:   return(0);
 84: }

 86: /*MC
 87:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
 88:   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
 89:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
 90:   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
 91:   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
 92:   algorithms are not recommended. This class does NOT support direct solver operations.

 94:   Level: beginner

 96: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
 97: M*/

 99: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
100: {
102:   PetscInt       n = A->rmap->n;

105:   MatCreate(PetscObjectComm((PetscObject)A),B);
106:   (*B)->factortype = ftype;
107:   MatSetSizes(*B,n,n,n,n);
108:   MatSetType(*B,MATSEQAIJCUSPARSE);

110:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
111:     MatSetBlockSizesFromMats(*B,A,A);
112:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
113:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
114:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
115:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
116:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
117:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

119:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
120:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
121:   return(0);
122: }

124: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
125: {
126:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

129:   switch (op) {
130:   case MAT_CUSPARSE_MULT:
131:     cusparsestruct->format = format;
132:     break;
133:   case MAT_CUSPARSE_ALL:
134:     cusparsestruct->format = format;
135:     break;
136:   default:
137:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
138:   }
139:   return(0);
140: }

142: /*@
143:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
144:    operation. Only the MatMult operation can use different GPU storage formats
145:    for MPIAIJCUSPARSE matrices.
146:    Not Collective

148:    Input Parameters:
149: +  A - Matrix of type SEQAIJCUSPARSE
150: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
151: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

153:    Output Parameter:

155:    Level: intermediate

157: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158: @*/
159: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
160: {

165:   PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
166:   return(0);
167: }

169: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
170: {
171:   PetscErrorCode           ierr;
172:   MatCUSPARSEStorageFormat format;
173:   PetscBool                flg;
174:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

177:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
178:   if (A->factortype==MAT_FACTOR_NONE) {
179:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
180:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
181:     if (flg) {
182:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
183:     }
184:   }
185:   PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
186:                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
187:   if (flg) {
188:     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
189:   }
190:   PetscOptionsTail();
191:   return(0);

193: }

195: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
196: {

200:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
201:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
202:   return(0);
203: }

205: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
206: {

210:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
211:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
212:   return(0);
213: }

215: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
216: {

220:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
221:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
222:   return(0);
223: }

225: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
226: {

230:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
231:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
232:   return(0);
233: }

235: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
236: {
237:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
238:   PetscInt                          n = A->rmap->n;
239:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
240:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
241:   cusparseStatus_t                  stat;
242:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
243:   const MatScalar                   *aa = a->a,*v;
244:   PetscInt                          *AiLo, *AjLo;
245:   PetscScalar                       *AALo;
246:   PetscInt                          i,nz, nzLower, offset, rowOffset;
247:   PetscErrorCode                    ierr;
248:   cudaError_t                       cerr;

251:   if (!n) return(0);
252:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
253:     try {
254:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
255:       nzLower=n+ai[n]-ai[1];

257:       /* Allocate Space for the lower triangular matrix */
258:       cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
259:       cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
260:       cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

262:       /* Fill the lower triangular matrix */
263:       AiLo[0]  = (PetscInt) 0;
264:       AiLo[n]  = nzLower;
265:       AjLo[0]  = (PetscInt) 0;
266:       AALo[0]  = (MatScalar) 1.0;
267:       v        = aa;
268:       vi       = aj;
269:       offset   = 1;
270:       rowOffset= 1;
271:       for (i=1; i<n; i++) {
272:         nz = ai[i+1] - ai[i];
273:         /* additional 1 for the term on the diagonal */
274:         AiLo[i]    = rowOffset;
275:         rowOffset += nz+1;

277:         PetscArraycpy(&(AjLo[offset]), vi, nz);
278:         PetscArraycpy(&(AALo[offset]), v, nz);

280:         offset      += nz;
281:         AjLo[offset] = (PetscInt) i;
282:         AALo[offset] = (MatScalar) 1.0;
283:         offset      += 1;

285:         v  += nz;
286:         vi += nz;
287:       }

289:       /* allocate space for the triangular factor information */
290:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

292:       /* Create the matrix description */
293:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
294:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
295:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
296:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
297:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

299:       /* Create the solve analysis information */
300:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

302:       /* set the operation */
303:       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

305:       /* set the matrix */
306:       loTriFactor->csrMat = new CsrMatrix;
307:       loTriFactor->csrMat->num_rows = n;
308:       loTriFactor->csrMat->num_cols = n;
309:       loTriFactor->csrMat->num_entries = nzLower;

311:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
312:       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

314:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
315:       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

317:       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
318:       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

320:       /* perform the solve analysis */
321:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
322:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
323:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
324:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

326:       /* assign the pointer. Is this really necessary? */
327:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

329:       cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
330:       cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
331:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
332:       PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
333:     } catch(char *ex) {
334:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
335:     }
336:   }
337:   return(0);
338: }

340: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
341: {
342:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
343:   PetscInt                          n = A->rmap->n;
344:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
345:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
346:   cusparseStatus_t                  stat;
347:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
348:   const MatScalar                   *aa = a->a,*v;
349:   PetscInt                          *AiUp, *AjUp;
350:   PetscScalar                       *AAUp;
351:   PetscInt                          i,nz, nzUpper, offset;
352:   PetscErrorCode                    ierr;
353:   cudaError_t                       cerr;

356:   if (!n) return(0);
357:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
358:     try {
359:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
360:       nzUpper = adiag[0]-adiag[n];

362:       /* Allocate Space for the upper triangular matrix */
363:       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
364:       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
365:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

367:       /* Fill the upper triangular matrix */
368:       AiUp[0]=(PetscInt) 0;
369:       AiUp[n]=nzUpper;
370:       offset = nzUpper;
371:       for (i=n-1; i>=0; i--) {
372:         v  = aa + adiag[i+1] + 1;
373:         vi = aj + adiag[i+1] + 1;

375:         /* number of elements NOT on the diagonal */
376:         nz = adiag[i] - adiag[i+1]-1;

378:         /* decrement the offset */
379:         offset -= (nz+1);

381:         /* first, set the diagonal elements */
382:         AjUp[offset] = (PetscInt) i;
383:         AAUp[offset] = (MatScalar)1./v[nz];
384:         AiUp[i]      = AiUp[i+1] - (nz+1);

386:         PetscArraycpy(&(AjUp[offset+1]), vi, nz);
387:         PetscArraycpy(&(AAUp[offset+1]), v, nz);
388:       }

390:       /* allocate space for the triangular factor information */
391:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

393:       /* Create the matrix description */
394:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
395:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
396:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
397:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
398:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

400:       /* Create the solve analysis information */
401:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

403:       /* set the operation */
404:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

406:       /* set the matrix */
407:       upTriFactor->csrMat = new CsrMatrix;
408:       upTriFactor->csrMat->num_rows = n;
409:       upTriFactor->csrMat->num_cols = n;
410:       upTriFactor->csrMat->num_entries = nzUpper;

412:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
413:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

415:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
416:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

418:       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
419:       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

421:       /* perform the solve analysis */
422:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
423:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
424:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
425:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

427:       /* assign the pointer. Is this really necessary? */
428:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

430:       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
431:       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
432:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
433:       PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
434:     } catch(char *ex) {
435:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
436:     }
437:   }
438:   return(0);
439: }

441: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
442: {
443:   PetscErrorCode               ierr;
444:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
445:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
446:   IS                           isrow = a->row,iscol = a->icol;
447:   PetscBool                    row_identity,col_identity;
448:   const PetscInt               *r,*c;
449:   PetscInt                     n = A->rmap->n;

452:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
453:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

455:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
456:   cusparseTriFactors->nnz=a->nz;

458:   A->offloadmask = PETSC_OFFLOAD_BOTH;
459:   /* lower triangular indices */
460:   ISGetIndices(isrow,&r);
461:   ISIdentity(isrow,&row_identity);
462:   if (!row_identity) {
463:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
464:     cusparseTriFactors->rpermIndices->assign(r, r+n);
465:   }
466:   ISRestoreIndices(isrow,&r);

468:   /* upper triangular indices */
469:   ISGetIndices(iscol,&c);
470:   ISIdentity(iscol,&col_identity);
471:   if (!col_identity) {
472:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
473:     cusparseTriFactors->cpermIndices->assign(c, c+n);
474:   }

476:   if (!row_identity && !col_identity) {
477:     PetscLogCpuToGpu(2*n*sizeof(PetscInt));
478:   } else if(!row_identity) {
479:     PetscLogCpuToGpu(n*sizeof(PetscInt));
480:   } else if(!col_identity) {
481:     PetscLogCpuToGpu(n*sizeof(PetscInt));
482:   }

484:   ISRestoreIndices(iscol,&c);
485:   return(0);
486: }

488: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
489: {
490:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
491:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
492:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
493:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
494:   cusparseStatus_t                  stat;
495:   PetscErrorCode                    ierr;
496:   cudaError_t                       cerr;
497:   PetscInt                          *AiUp, *AjUp;
498:   PetscScalar                       *AAUp;
499:   PetscScalar                       *AALo;
500:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
501:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
502:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
503:   const MatScalar                   *aa = b->a,*v;

506:   if (!n) return(0);
507:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
508:     try {
509:       /* Allocate Space for the upper triangular matrix */
510:       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
511:       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
512:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
513:       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

515:       /* Fill the upper triangular matrix */
516:       AiUp[0]=(PetscInt) 0;
517:       AiUp[n]=nzUpper;
518:       offset = 0;
519:       for (i=0; i<n; i++) {
520:         /* set the pointers */
521:         v  = aa + ai[i];
522:         vj = aj + ai[i];
523:         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

525:         /* first, set the diagonal elements */
526:         AjUp[offset] = (PetscInt) i;
527:         AAUp[offset] = (MatScalar)1.0/v[nz];
528:         AiUp[i]      = offset;
529:         AALo[offset] = (MatScalar)1.0/v[nz];

531:         offset+=1;
532:         if (nz>0) {
533:           PetscArraycpy(&(AjUp[offset]), vj, nz);
534:           PetscArraycpy(&(AAUp[offset]), v, nz);
535:           for (j=offset; j<offset+nz; j++) {
536:             AAUp[j] = -AAUp[j];
537:             AALo[j] = AAUp[j]/v[nz];
538:           }
539:           offset+=nz;
540:         }
541:       }

543:       /* allocate space for the triangular factor information */
544:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

546:       /* Create the matrix description */
547:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
548:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
549:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
550:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
551:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

553:       /* Create the solve analysis information */
554:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

556:       /* set the operation */
557:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

559:       /* set the matrix */
560:       upTriFactor->csrMat = new CsrMatrix;
561:       upTriFactor->csrMat->num_rows = A->rmap->n;
562:       upTriFactor->csrMat->num_cols = A->cmap->n;
563:       upTriFactor->csrMat->num_entries = a->nz;

565:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
566:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

568:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
569:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

571:       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
572:       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

574:       /* perform the solve analysis */
575:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
576:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
577:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
578:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

580:       /* assign the pointer. Is this really necessary? */
581:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

583:       /* allocate space for the triangular factor information */
584:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

586:       /* Create the matrix description */
587:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
588:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
589:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
590:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
591:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

593:       /* Create the solve analysis information */
594:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

596:       /* set the operation */
597:       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

599:       /* set the matrix */
600:       loTriFactor->csrMat = new CsrMatrix;
601:       loTriFactor->csrMat->num_rows = A->rmap->n;
602:       loTriFactor->csrMat->num_cols = A->cmap->n;
603:       loTriFactor->csrMat->num_entries = a->nz;

605:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
606:       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

608:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
609:       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

611:       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
612:       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
613:       PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));

615:       /* perform the solve analysis */
616:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
617:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
618:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
619:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

621:       /* assign the pointer. Is this really necessary? */
622:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

624:       A->offloadmask = PETSC_OFFLOAD_BOTH;
625:       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
626:       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
627:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
628:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
629:     } catch(char *ex) {
630:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
631:     }
632:   }
633:   return(0);
634: }

636: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
637: {
638:   PetscErrorCode               ierr;
639:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
640:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
641:   IS                           ip = a->row;
642:   const PetscInt               *rip;
643:   PetscBool                    perm_identity;
644:   PetscInt                     n = A->rmap->n;

647:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
648:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
649:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;

651:   /* lower triangular indices */
652:   ISGetIndices(ip,&rip);
653:   ISIdentity(ip,&perm_identity);
654:   if (!perm_identity) {
655:     IS             iip;
656:     const PetscInt *irip;

658:     ISInvertPermutation(ip,PETSC_DECIDE,&iip);
659:     ISGetIndices(iip,&irip);
660:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
661:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
662:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
663:     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
664:     ISRestoreIndices(iip,&irip);
665:     ISDestroy(&iip);
666:     PetscLogCpuToGpu(2*n*sizeof(PetscInt));
667:   }
668:   ISRestoreIndices(ip,&rip);
669:   return(0);
670: }

672: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
673: {
674:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
675:   IS             isrow = b->row,iscol = b->col;
676:   PetscBool      row_identity,col_identity;

680:   MatLUFactorNumeric_SeqAIJ(B,A,info);
681:   /* determine which version of MatSolve needs to be used. */
682:   ISIdentity(isrow,&row_identity);
683:   ISIdentity(iscol,&col_identity);
684:   if (row_identity && col_identity) {
685:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
686:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
687:     B->ops->matsolve = NULL;
688:     B->ops->matsolvetranspose = NULL;
689:   } else {
690:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
691:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
692:     B->ops->matsolve = NULL;
693:     B->ops->matsolvetranspose = NULL;
694:   }

696:   /* get the triangular factors */
697:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
698:   return(0);
699: }

701: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
702: {
703:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
704:   IS             ip = b->row;
705:   PetscBool      perm_identity;

709:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);

711:   /* determine which version of MatSolve needs to be used. */
712:   ISIdentity(ip,&perm_identity);
713:   if (perm_identity) {
714:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
715:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
716:     B->ops->matsolve = NULL;
717:     B->ops->matsolvetranspose = NULL;
718:   } else {
719:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
720:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
721:     B->ops->matsolve = NULL;
722:     B->ops->matsolvetranspose = NULL;
723:   }

725:   /* get the triangular factors */
726:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
727:   return(0);
728: }

730: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
731: {
732:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
733:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
734:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
735:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
736:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
737:   cusparseStatus_t                  stat;
738:   cusparseIndexBase_t               indexBase;
739:   cusparseMatrixType_t              matrixType;
740:   cusparseFillMode_t                fillMode;
741:   cusparseDiagType_t                diagType;


745:   /*********************************************/
746:   /* Now the Transpose of the Lower Tri Factor */
747:   /*********************************************/

749:   /* allocate space for the transpose of the lower triangular factor */
750:   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

752:   /* set the matrix descriptors of the lower triangular factor */
753:   matrixType = cusparseGetMatType(loTriFactor->descr);
754:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
755:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
756:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
757:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

759:   /* Create the matrix description */
760:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
761:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
762:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
763:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
764:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

766:   /* Create the solve analysis information */
767:   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

769:   /* set the operation */
770:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

772:   /* allocate GPU space for the CSC of the lower triangular factor*/
773:   loTriFactorT->csrMat = new CsrMatrix;
774:   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
775:   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
776:   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
777:   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
778:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
779:   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

781:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
782:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
783:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
784:                           loTriFactor->csrMat->values->data().get(),
785:                           loTriFactor->csrMat->row_offsets->data().get(),
786:                           loTriFactor->csrMat->column_indices->data().get(),
787:                           loTriFactorT->csrMat->values->data().get(),
788:                           loTriFactorT->csrMat->column_indices->data().get(),
789:                           loTriFactorT->csrMat->row_offsets->data().get(),
790:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

792:   /* perform the solve analysis on the transposed matrix */
793:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
794:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
795:                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
796:                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
797:                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

799:   /* assign the pointer. Is this really necessary? */
800:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

802:   /*********************************************/
803:   /* Now the Transpose of the Upper Tri Factor */
804:   /*********************************************/

806:   /* allocate space for the transpose of the upper triangular factor */
807:   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

809:   /* set the matrix descriptors of the upper triangular factor */
810:   matrixType = cusparseGetMatType(upTriFactor->descr);
811:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
812:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
813:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
814:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

816:   /* Create the matrix description */
817:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
818:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
819:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
820:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
821:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

823:   /* Create the solve analysis information */
824:   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

826:   /* set the operation */
827:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

829:   /* allocate GPU space for the CSC of the upper triangular factor*/
830:   upTriFactorT->csrMat = new CsrMatrix;
831:   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
832:   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
833:   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
834:   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
835:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
836:   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

838:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
839:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
840:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
841:                           upTriFactor->csrMat->values->data().get(),
842:                           upTriFactor->csrMat->row_offsets->data().get(),
843:                           upTriFactor->csrMat->column_indices->data().get(),
844:                           upTriFactorT->csrMat->values->data().get(),
845:                           upTriFactorT->csrMat->column_indices->data().get(),
846:                           upTriFactorT->csrMat->row_offsets->data().get(),
847:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

849:   /* perform the solve analysis on the transposed matrix */
850:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
851:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
852:                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
853:                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
854:                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

856:   /* assign the pointer. Is this really necessary? */
857:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
858:   return(0);
859: }

861: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
862: {
863:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
864:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
865:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
866:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
867:   cusparseStatus_t             stat;
868:   cusparseIndexBase_t          indexBase;
869:   cudaError_t                  err;
870:   PetscErrorCode               ierr;


874:   /* allocate space for the triangular factor information */
875:   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
876:   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
877:   indexBase = cusparseGetMatIndexBase(matstruct->descr);
878:   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
879:   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

881:   /* set alpha and beta */
882:   err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
883:   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
884:   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
885:   err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
886:   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
887:   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
888:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

890:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
891:     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
892:     CsrMatrix *matrixT= new CsrMatrix;
893:     matrixT->num_rows = A->cmap->n;
894:     matrixT->num_cols = A->rmap->n;
895:     matrixT->num_entries = a->nz;
896:     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
897:     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
898:     matrixT->values = new THRUSTARRAY(a->nz);

900:     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
901:     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
902:     /* compute the transpose, i.e. the CSC */
903:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
904:     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
905:                             A->cmap->n, matrix->num_entries,
906:                             matrix->values->data().get(),
907:                             cusparsestruct->rowoffsets_gpu->data().get(),
908:                             matrix->column_indices->data().get(),
909:                             matrixT->values->data().get(),
910:                             matrixT->column_indices->data().get(),
911:                             matrixT->row_offsets->data().get(),
912:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
913:     /* assign the pointer */
914:     matstructT->mat = matrixT;
915:     PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));
916:   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
917:     /* First convert HYB to CSR */
918:     CsrMatrix *temp= new CsrMatrix;
919:     temp->num_rows = A->rmap->n;
920:     temp->num_cols = A->cmap->n;
921:     temp->num_entries = a->nz;
922:     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
923:     temp->column_indices = new THRUSTINTARRAY32(a->nz);
924:     temp->values = new THRUSTARRAY(a->nz);


927:     stat = cusparse_hyb2csr(cusparsestruct->handle,
928:                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
929:                             temp->values->data().get(),
930:                             temp->row_offsets->data().get(),
931:                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);

933:     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
934:     CsrMatrix *tempT= new CsrMatrix;
935:     tempT->num_rows = A->rmap->n;
936:     tempT->num_cols = A->cmap->n;
937:     tempT->num_entries = a->nz;
938:     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
939:     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
940:     tempT->values = new THRUSTARRAY(a->nz);

942:     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
943:                             temp->num_cols, temp->num_entries,
944:                             temp->values->data().get(),
945:                             temp->row_offsets->data().get(),
946:                             temp->column_indices->data().get(),
947:                             tempT->values->data().get(),
948:                             tempT->column_indices->data().get(),
949:                             tempT->row_offsets->data().get(),
950:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

952:     /* Last, convert CSC to HYB */
953:     cusparseHybMat_t hybMat;
954:     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
955:     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
956:       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
957:     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
958:                             matstructT->descr, tempT->values->data().get(),
959:                             tempT->row_offsets->data().get(),
960:                             tempT->column_indices->data().get(),
961:                             hybMat, 0, partition);CHKERRCUSPARSE(stat);

963:     /* assign the pointer */
964:     matstructT->mat = hybMat;
965:     PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));

967:     /* delete temporaries */
968:     if (tempT) {
969:       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
970:       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
971:       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
972:       delete (CsrMatrix*) tempT;
973:     }
974:     if (temp) {
975:       if (temp->values) delete (THRUSTARRAY*) temp->values;
976:       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
977:       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
978:       delete (CsrMatrix*) temp;
979:     }
980:   }
981:   /* assign the compressed row indices */
982:   matstructT->cprowIndices = new THRUSTINTARRAY;
983:   matstructT->cprowIndices->resize(A->cmap->n);
984:   thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
985:   /* assign the pointer */
986:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
987:   return(0);
988: }

990: /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
991: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
992: {
993:   PetscInt                              n = xx->map->n;
994:   const PetscScalar                     *barray;
995:   PetscScalar                           *xarray;
996:   thrust::device_ptr<const PetscScalar> bGPU;
997:   thrust::device_ptr<PetscScalar>       xGPU;
998:   cusparseStatus_t                      stat;
999:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1000:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1001:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1002:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1003:   PetscErrorCode                        ierr;
1004:   cudaError_t                           cerr;

1007:   /* Analyze the matrix and create the transpose ... on the fly */
1008:   if (!loTriFactorT && !upTriFactorT) {
1009:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1010:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1011:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1012:   }

1014:   /* Get the GPU pointers */
1015:   VecCUDAGetArrayWrite(xx,&xarray);
1016:   VecCUDAGetArrayRead(bb,&barray);
1017:   xGPU = thrust::device_pointer_cast(xarray);
1018:   bGPU = thrust::device_pointer_cast(barray);

1020:   PetscLogGpuTimeBegin();
1021:   /* First, reorder with the row permutation */
1022:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1023:                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1024:                xGPU);

1026:   /* First, solve U */
1027:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1028:                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1029:                         upTriFactorT->csrMat->values->data().get(),
1030:                         upTriFactorT->csrMat->row_offsets->data().get(),
1031:                         upTriFactorT->csrMat->column_indices->data().get(),
1032:                         upTriFactorT->solveInfo,
1033:                         xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

1035:   /* Then, solve L */
1036:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1037:                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1038:                         loTriFactorT->csrMat->values->data().get(),
1039:                         loTriFactorT->csrMat->row_offsets->data().get(),
1040:                         loTriFactorT->csrMat->column_indices->data().get(),
1041:                         loTriFactorT->solveInfo,
1042:                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

1044:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1045:   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1046:                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1047:                tempGPU->begin());

1049:   /* Copy the temporary to the full solution. */
1050:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

1052:   /* restore */
1053:   VecCUDARestoreArrayRead(bb,&barray);
1054:   VecCUDARestoreArrayWrite(xx,&xarray);
1055:   cerr = WaitForGPU();CHKERRCUDA(cerr);
1056:   PetscLogGpuTimeEnd();
1057:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1058:   return(0);
1059: }

1061: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1062: {
1063:   const PetscScalar                 *barray;
1064:   PetscScalar                       *xarray;
1065:   cusparseStatus_t                  stat;
1066:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1067:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1068:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1069:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1070:   PetscErrorCode                    ierr;
1071:   cudaError_t                       cerr;

1074:   /* Analyze the matrix and create the transpose ... on the fly */
1075:   if (!loTriFactorT && !upTriFactorT) {
1076:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1077:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1078:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1079:   }

1081:   /* Get the GPU pointers */
1082:   VecCUDAGetArrayWrite(xx,&xarray);
1083:   VecCUDAGetArrayRead(bb,&barray);

1085:   PetscLogGpuTimeBegin();
1086:   /* First, solve U */
1087:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1088:                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1089:                         upTriFactorT->csrMat->values->data().get(),
1090:                         upTriFactorT->csrMat->row_offsets->data().get(),
1091:                         upTriFactorT->csrMat->column_indices->data().get(),
1092:                         upTriFactorT->solveInfo,
1093:                         barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

1095:   /* Then, solve L */
1096:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1097:                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1098:                         loTriFactorT->csrMat->values->data().get(),
1099:                         loTriFactorT->csrMat->row_offsets->data().get(),
1100:                         loTriFactorT->csrMat->column_indices->data().get(),
1101:                         loTriFactorT->solveInfo,
1102:                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

1104:   /* restore */
1105:   VecCUDARestoreArrayRead(bb,&barray);
1106:   VecCUDARestoreArrayWrite(xx,&xarray);
1107:   cerr = WaitForGPU();CHKERRCUDA(cerr);
1108:   PetscLogGpuTimeEnd();
1109:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1110:   return(0);
1111: }

1113: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1114: {
1115:   const PetscScalar                     *barray;
1116:   PetscScalar                           *xarray;
1117:   thrust::device_ptr<const PetscScalar> bGPU;
1118:   thrust::device_ptr<PetscScalar>       xGPU;
1119:   cusparseStatus_t                      stat;
1120:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1121:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1122:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1123:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1124:   PetscErrorCode                        ierr;
1125:   cudaError_t                           cerr;


1129:   /* Get the GPU pointers */
1130:   VecCUDAGetArrayWrite(xx,&xarray);
1131:   VecCUDAGetArrayRead(bb,&barray);
1132:   xGPU = thrust::device_pointer_cast(xarray);
1133:   bGPU = thrust::device_pointer_cast(barray);

1135:   PetscLogGpuTimeBegin();
1136:   /* First, reorder with the row permutation */
1137:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1138:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1139:                tempGPU->begin());

1141:   /* Next, solve L */
1142:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1143:                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1144:                         loTriFactor->csrMat->values->data().get(),
1145:                         loTriFactor->csrMat->row_offsets->data().get(),
1146:                         loTriFactor->csrMat->column_indices->data().get(),
1147:                         loTriFactor->solveInfo,
1148:                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

1150:   /* Then, solve U */
1151:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1152:                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1153:                         upTriFactor->csrMat->values->data().get(),
1154:                         upTriFactor->csrMat->row_offsets->data().get(),
1155:                         upTriFactor->csrMat->column_indices->data().get(),
1156:                         upTriFactor->solveInfo,
1157:                         xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

1159:   /* Last, reorder with the column permutation */
1160:   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1161:                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1162:                xGPU);

1164:   VecCUDARestoreArrayRead(bb,&barray);
1165:   VecCUDARestoreArrayWrite(xx,&xarray);
1166:   cerr = WaitForGPU();CHKERRCUDA(cerr);
1167:   PetscLogGpuTimeEnd();
1168:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1169:   return(0);
1170: }

1172: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1173: {
1174:   const PetscScalar                 *barray;
1175:   PetscScalar                       *xarray;
1176:   cusparseStatus_t                  stat;
1177:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1178:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1179:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1180:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1181:   PetscErrorCode                    ierr;
1182:   cudaError_t                       cerr;

1185:   /* Get the GPU pointers */
1186:   VecCUDAGetArrayWrite(xx,&xarray);
1187:   VecCUDAGetArrayRead(bb,&barray);

1189:   PetscLogGpuTimeBegin();
1190:   /* First, solve L */
1191:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1192:                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1193:                         loTriFactor->csrMat->values->data().get(),
1194:                         loTriFactor->csrMat->row_offsets->data().get(),
1195:                         loTriFactor->csrMat->column_indices->data().get(),
1196:                         loTriFactor->solveInfo,
1197:                         barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

1199:   /* Next, solve U */
1200:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1201:                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1202:                         upTriFactor->csrMat->values->data().get(),
1203:                         upTriFactor->csrMat->row_offsets->data().get(),
1204:                         upTriFactor->csrMat->column_indices->data().get(),
1205:                         upTriFactor->solveInfo,
1206:                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

1208:   VecCUDARestoreArrayRead(bb,&barray);
1209:   VecCUDARestoreArrayWrite(xx,&xarray);
1210:   cerr = WaitForGPU();CHKERRCUDA(cerr);
1211:   PetscLogGpuTimeEnd();
1212:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1213:   return(0);
1214: }

1216: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1217: {
1218:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1219:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1220:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1221:   PetscInt                     m = A->rmap->n,*ii,*ridx;
1222:   PetscErrorCode               ierr;
1223:   cusparseStatus_t             stat;
1224:   cudaError_t                  err;

1227:   if (A->boundtocpu) return(0);
1228:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1229:     PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1230:     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1231:       /* Copy values only */
1232:       CsrMatrix *mat,*matT;
1233:       mat  = (CsrMatrix*)cusparsestruct->mat->mat;
1234:       mat->values->assign(a->a, a->a+a->nz);
1235:       PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));

1237:       /* Update matT when it was built before */
1238:       if (cusparsestruct->matTranspose) {
1239:         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1240:         matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1241:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1242:                                  A->cmap->n, mat->num_entries,
1243:                                  mat->values->data().get(),
1244:                                  cusparsestruct->rowoffsets_gpu->data().get(),
1245:                                  mat->column_indices->data().get(),
1246:                                  matT->values->data().get(),
1247:                                  matT->column_indices->data().get(),
1248:                                  matT->row_offsets->data().get(),
1249:                                  CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
1250:       }
1251:     } else {
1252:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1253:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);
1254:       delete cusparsestruct->workVector;
1255:       delete cusparsestruct->rowoffsets_gpu;
1256:       try {
1257:         cusparsestruct->nonzerorow=0;
1258:         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

1260:         if (a->compressedrow.use) {
1261:           m    = a->compressedrow.nrows;
1262:           ii   = a->compressedrow.i;
1263:           ridx = a->compressedrow.rindex;
1264:         } else {
1265:           /* Forcing compressed row on the GPU */
1266:           int k=0;
1267:           PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1268:           PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1269:           ii[0]=0;
1270:           for (int j = 0; j<m; j++) {
1271:             if ((a->i[j+1]-a->i[j])>0) {
1272:               ii[k]  = a->i[j];
1273:               ridx[k]= j;
1274:               k++;
1275:             }
1276:           }
1277:           ii[cusparsestruct->nonzerorow] = a->nz;
1278:           m = cusparsestruct->nonzerorow;
1279:         }

1281:         /* allocate space for the triangular factor information */
1282:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1283:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1284:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1285:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1287:         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1288:         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1289:         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1290:         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1291:         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1292:         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1293:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

1295:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1296:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1297:           /* set the matrix */
1298:           CsrMatrix *matrix= new CsrMatrix;
1299:           matrix->num_rows = m;
1300:           matrix->num_cols = A->cmap->n;
1301:           matrix->num_entries = a->nz;
1302:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1303:           matrix->row_offsets->assign(ii, ii + m+1);

1305:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1306:           matrix->column_indices->assign(a->j, a->j+a->nz);

1308:           matrix->values = new THRUSTARRAY(a->nz);
1309:           matrix->values->assign(a->a, a->a+a->nz);

1311:           /* assign the pointer */
1312:           matstruct->mat = matrix;

1314:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1315:           CsrMatrix *matrix= new CsrMatrix;
1316:           matrix->num_rows = m;
1317:           matrix->num_cols = A->cmap->n;
1318:           matrix->num_entries = a->nz;
1319:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1320:           matrix->row_offsets->assign(ii, ii + m+1);

1322:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1323:           matrix->column_indices->assign(a->j, a->j+a->nz);

1325:           matrix->values = new THRUSTARRAY(a->nz);
1326:           matrix->values->assign(a->a, a->a+a->nz);

1328:           cusparseHybMat_t hybMat;
1329:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1330:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1331:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1332:           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1333:               matstruct->descr, matrix->values->data().get(),
1334:               matrix->row_offsets->data().get(),
1335:               matrix->column_indices->data().get(),
1336:               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1337:           /* assign the pointer */
1338:           matstruct->mat = hybMat;

1340:           if (matrix) {
1341:             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1342:             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1343:             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1344:             delete (CsrMatrix*)matrix;
1345:           }
1346:         }

1348:         /* assign the compressed row indices */
1349:         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1350:         matstruct->cprowIndices->assign(ridx,ridx+m);
1351:         PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));

1353:         /* assign the pointer */
1354:         cusparsestruct->mat = matstruct;

1356:         if (!a->compressedrow.use) {
1357:           PetscFree(ii);
1358:           PetscFree(ridx);
1359:         }
1360:         cusparsestruct->workVector = new THRUSTARRAY(m);
1361:       } catch(char *ex) {
1362:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1363:       }
1364:       cusparsestruct->nonzerostate = A->nonzerostate;
1365:     }
1366:     err  = WaitForGPU();CHKERRCUDA(err);
1367:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1368:     PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1369:   }
1370:   return(0);
1371: }

1373: struct VecCUDAPlusEquals
1374: {
1375:   template <typename Tuple>
1376:   __host__ __device__
1377:   void operator()(Tuple t)
1378:   {
1379:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1380:   }
1381: };

1383: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1384: {

1388:   MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);
1389:   return(0);
1390: }

1392: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1393: {
1394:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1395:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1396:   Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1397:   const PetscScalar            *xarray;
1398:   PetscScalar                  *yarray;
1399:   PetscErrorCode               ierr;
1400:   cudaError_t                  cerr;
1401:   cusparseStatus_t             stat;

1404:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1405:   MatSeqAIJCUSPARSECopyToGPU(A);
1406:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1407:   if (!matstructT) {
1408:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1409:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1410:   }
1411:   VecCUDAGetArrayRead(xx,&xarray);
1412:   VecCUDAGetArrayWrite(yy,&yarray);
1413:   if (yy->map->n) {
1414:     PetscInt                     n = yy->map->n;
1415:     cudaError_t                  err;
1416:     err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
1417:   }

1419:   PetscLogGpuTimeBegin();
1420:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1421:     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1422:     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1423:                              mat->num_rows, mat->num_cols,
1424:                              mat->num_entries, matstructT->alpha, matstructT->descr,
1425:                              mat->values->data().get(), mat->row_offsets->data().get(),
1426:                              mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1427:                              yarray);CHKERRCUSPARSE(stat);
1428:   } else {
1429:     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1430:     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1431:                              matstructT->alpha, matstructT->descr, hybMat,
1432:                              xarray, matstructT->beta_zero,
1433:                              yarray);CHKERRCUSPARSE(stat);
1434:   }
1435:   VecCUDARestoreArrayRead(xx,&xarray);
1436:   VecCUDARestoreArrayWrite(yy,&yarray);
1437:   cerr = WaitForGPU();CHKERRCUDA(cerr);
1438:   PetscLogGpuTimeEnd();
1439:   PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1440:   return(0);
1441: }


1444: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1445: {
1446:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1447:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1448:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1449:   const PetscScalar            *xarray;
1450:   PetscScalar                  *zarray,*dptr,*beta;
1451:   PetscErrorCode               ierr;
1452:   cudaError_t                  cerr;
1453:   cusparseStatus_t             stat;
1454:   PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

1457:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1458:   MatSeqAIJCUSPARSECopyToGPU(A);
1459:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1460:   try {
1461:     cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
1462:     VecCUDAGetArrayRead(xx,&xarray);
1463:     if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
1464:       VecCUDAGetArray(zz,&zarray);
1465:     } else {
1466:       VecCUDAGetArrayWrite(zz,&zarray);
1467:     }
1468:     dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
1469:     beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

1471:     PetscLogGpuTimeBegin();
1472:     /* csr_spmv is multiply add */
1473:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1474:       /* here we need to be careful to set the number of rows in the multiply to the
1475:          number of compressed rows in the matrix ... which is equivalent to the
1476:          size of the workVector */
1477:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1478:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1479:                                mat->num_rows, mat->num_cols,
1480:                                mat->num_entries, matstruct->alpha, matstruct->descr,
1481:                                mat->values->data().get(), mat->row_offsets->data().get(),
1482:                                mat->column_indices->data().get(), xarray, beta,
1483:                                dptr);CHKERRCUSPARSE(stat);
1484:     } else {
1485:       if (cusparsestruct->workVector->size()) {
1486:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1487:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1488:                                  matstruct->alpha, matstruct->descr, hybMat,
1489:                                  xarray, beta,
1490:                                  dptr);CHKERRCUSPARSE(stat);
1491:       }
1492:     }
1493:     PetscLogGpuTimeEnd();

1495:     if (yy) {
1496:       if (dptr != zarray) {
1497:         VecCopy_SeqCUDA(yy,zz);
1498:       } else if (zz != yy) {
1499:         VecAXPY_SeqCUDA(zz,1.0,yy);
1500:       }
1501:     } else if (dptr != zarray) {
1502:       VecSet_SeqCUDA(zz,0);
1503:     }
1504:     /* scatter the data from the temporary into the full vector with a += operation */
1505:     PetscLogGpuTimeBegin();
1506:     if (dptr != zarray) {
1507:       thrust::device_ptr<PetscScalar> zptr;

1509:       zptr = thrust::device_pointer_cast(zarray);
1510:       thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1511:                        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1512:                        VecCUDAPlusEquals());
1513:     }
1514:     PetscLogGpuTimeEnd();
1515:     VecCUDARestoreArrayRead(xx,&xarray);
1516:     if (yy && !cmpr) {
1517:       VecCUDARestoreArray(zz,&zarray);
1518:     } else {
1519:       VecCUDARestoreArrayWrite(zz,&zarray);
1520:     }
1521:   } catch(char *ex) {
1522:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1523:   }
1524:   cerr = WaitForGPU();CHKERRCUDA(cerr);
1525:   PetscLogGpuFlops(2.0*a->nz);
1526:   return(0);
1527: }

1529: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1530: {
1531:   Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
1532:   Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1533:   Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
1534:   const PetscScalar               *xarray;
1535:   PetscScalar                     *zarray,*beta;
1536:   PetscErrorCode                  ierr;
1537:   cudaError_t                     cerr;
1538:   cusparseStatus_t                stat;

1541:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1542:   MatSeqAIJCUSPARSECopyToGPU(A);
1543:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1544:   if (!matstructT) {
1545:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1546:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1547:   }

1549:   /* Note unlike Mat, MatTranspose uses non-compressed row storage */
1550:   try {
1551:     VecCopy_SeqCUDA(yy,zz);
1552:     VecCUDAGetArrayRead(xx,&xarray);
1553:     VecCUDAGetArray(zz,&zarray);
1554:     beta = (yy == zz) ? matstructT->beta_one : matstructT->beta_zero;

1556:     PetscLogGpuTimeBegin();
1557:     /* multiply add with matrix transpose */
1558:     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1559:       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1560:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1561:                                mat->num_rows, mat->num_cols,
1562:                                mat->num_entries, matstructT->alpha, matstructT->descr,
1563:                                mat->values->data().get(), mat->row_offsets->data().get(),
1564:                                mat->column_indices->data().get(), xarray, beta,
1565:                                zarray);CHKERRCUSPARSE(stat);
1566:     } else {
1567:       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1568:       if (cusparsestruct->workVector->size()) {
1569:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1570:                                  matstructT->alpha, matstructT->descr, hybMat,
1571:                                  xarray, beta,
1572:                                  zarray);CHKERRCUSPARSE(stat);
1573:       }
1574:     }
1575:     PetscLogGpuTimeEnd();

1577:     if (zz != yy) {VecAXPY_SeqCUDA(zz,1.0,yy);}
1578:     VecCUDARestoreArrayRead(xx,&xarray);
1579:     VecCUDARestoreArray(zz,&zarray);
1580:   } catch(char *ex) {
1581:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1582:   }
1583:   cerr = WaitForGPU();CHKERRCUDA(cerr);
1584:   PetscLogGpuFlops(2.0*a->nz);
1585:   return(0);
1586: }

1588: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1589: {

1593:   MatAssemblyEnd_SeqAIJ(A,mode);
1594:   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) return(0);
1595:   if (A->factortype == MAT_FACTOR_NONE) {
1596:     MatSeqAIJCUSPARSECopyToGPU(A);
1597:   }
1598:   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1599:   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1600:   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1601:   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1602:   return(0);
1603: }

1605: /* --------------------------------------------------------------------------------*/
1606: /*@
1607:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1608:    (the default parallel PETSc format). This matrix will ultimately pushed down
1609:    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1610:    assembly performance the user should preallocate the matrix storage by setting
1611:    the parameter nz (or the array nnz).  By setting these parameters accurately,
1612:    performance during matrix assembly can be increased by more than a factor of 50.

1614:    Collective

1616:    Input Parameters:
1617: +  comm - MPI communicator, set to PETSC_COMM_SELF
1618: .  m - number of rows
1619: .  n - number of columns
1620: .  nz - number of nonzeros per row (same for all rows)
1621: -  nnz - array containing the number of nonzeros in the various rows
1622:          (possibly different for each row) or NULL

1624:    Output Parameter:
1625: .  A - the matrix

1627:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1628:    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1629:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

1631:    Notes:
1632:    If nnz is given then nz is ignored

1634:    The AIJ format (also called the Yale sparse matrix format or
1635:    compressed row storage), is fully compatible with standard Fortran 77
1636:    storage.  That is, the stored row and column indices can begin at
1637:    either one (as in Fortran) or zero.  See the users' manual for details.

1639:    Specify the preallocated storage with either nz or nnz (not both).
1640:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1641:    allocation.  For large problems you MUST preallocate memory or you
1642:    will get TERRIBLE performance, see the users' manual chapter on matrices.

1644:    By default, this format uses inodes (identical nodes) when possible, to
1645:    improve numerical efficiency of matrix-vector products and solves. We
1646:    search for consecutive rows with the same nonzero structure, thereby
1647:    reusing matrix information to achieve increased efficiency.

1649:    Level: intermediate

1651: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1652: @*/
1653: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1654: {

1658:   MatCreate(comm,A);
1659:   MatSetSizes(*A,m,n,m,n);
1660:   MatSetType(*A,MATSEQAIJCUSPARSE);
1661:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1662:   return(0);
1663: }

1665: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1666: {
1667:   PetscErrorCode   ierr;

1670:   if (A->factortype==MAT_FACTOR_NONE) {
1671:     if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
1672:       MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1673:     }
1674:   } else {
1675:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1676:   }
1677:   MatDestroy_SeqAIJ(A);
1678:   return(0);
1679: }

1681: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
1682: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1683: {
1685:   Mat C;
1686:   cusparseStatus_t stat;
1687:   cusparseHandle_t handle=0;

1690:   MatDuplicate_SeqAIJ(A,cpvalues,B);
1691:   C    = *B;
1692:   PetscFree(C->defaultvectype);
1693:   PetscStrallocpy(VECCUDA,&C->defaultvectype);

1695:   /* inject CUSPARSE-specific stuff */
1696:   if (C->factortype==MAT_FACTOR_NONE) {
1697:     /* you cannot check the inode.use flag here since the matrix was just created.
1698:        now build a GPU matrix data structure */
1699:     C->spptr = new Mat_SeqAIJCUSPARSE;
1700:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat            = 0;
1701:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose   = 0;
1702:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector     = 0;
1703:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->rowoffsets_gpu = 0;
1704:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->format         = MAT_CUSPARSE_CSR;
1705:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream         = 0;
1706:     stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
1707:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle         = handle;
1708:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate   = 0;
1709:   } else {
1710:     /* NEXT, set the pointers to the triangular factors */
1711:     C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1712:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
1713:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
1714:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1715:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1716:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
1717:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
1718:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
1719:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
1720:     stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
1721:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
1722:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
1723:   }

1725:   C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1726:   C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1727:   C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1728:   C->ops->mult             = MatMult_SeqAIJCUSPARSE;
1729:   C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1730:   C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1731:   C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1732:   C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
1733:   C->ops->bindtocpu        = MatBindToCPU_SeqAIJCUSPARSE;

1735:   PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);

1737:   C->boundtocpu  = PETSC_FALSE;
1738:   C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

1740:   PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1741:   return(0);
1742: }


1745: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
1746: {
1748:   /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETS_OFFLOAD_GPU.
1749:      If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
1750:      Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() do nothing in this case.
1751:      TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matries;
1752:            can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
1753:   if (A->offloadmask == PETSC_OFFLOAD_GPU) return(0);
1754:   if (flg) {
1755:     A->ops->mult             = MatMult_SeqAIJ;
1756:     A->ops->multadd          = MatMultAdd_SeqAIJ;
1757:     A->ops->multtranspose    = MatMultTranspose_SeqAIJ;
1758:     A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
1759:     A->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
1760:     A->ops->duplicate        = MatDuplicate_SeqAIJ;
1761:   } else {
1762:     A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1763:     A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1764:     A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1765:     A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1766:     A->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1767:     A->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1768:     A->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
1769:   }
1770:   A->boundtocpu = flg;
1771:   return(0);
1772: }

1774: PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
1775: {
1777:   cusparseStatus_t stat;
1778:   cusparseHandle_t handle=0;

1781:   PetscFree(B->defaultvectype);
1782:   PetscStrallocpy(VECCUDA,&B->defaultvectype);

1784:   if (B->factortype==MAT_FACTOR_NONE) {
1785:     /* you cannot check the inode.use flag here since the matrix was just created.
1786:        now build a GPU matrix data structure */
1787:     B->spptr = new Mat_SeqAIJCUSPARSE;
1788:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat            = 0;
1789:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose   = 0;
1790:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector     = 0;
1791:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->rowoffsets_gpu = 0;
1792:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format         = MAT_CUSPARSE_CSR;
1793:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream         = 0;
1794:     stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
1795:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle         = handle;
1796:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate   = 0;
1797:   } else {
1798:     /* NEXT, set the pointers to the triangular factors */
1799:     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1800:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
1801:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1802:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1803:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1804:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1805:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1806:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1807:     stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
1808:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1809:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
1810:   }

1812:   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1813:   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1814:   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1815:   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1816:   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1817:   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1818:   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1819:   B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
1820:   B->ops->bindtocpu        = MatBindToCPU_SeqAIJCUSPARSE;

1822:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);

1824:   B->boundtocpu  = PETSC_FALSE;
1825:   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

1827:   PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1828:   return(0);
1829: }

1831: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1832: {

1836:   MatCreate_SeqAIJ(B);
1837:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);
1838:   return(0);
1839: }

1841: /*MC
1842:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

1844:    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1845:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1846:    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

1848:    Options Database Keys:
1849: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1850: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1851: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

1853:   Level: beginner

1855: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1856: M*/

1858: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


1861: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1862: {

1866:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
1867:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
1868:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
1869:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
1870:   return(0);
1871: }


1874: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1875: {
1876:   cusparseStatus_t stat;
1877:   cusparseHandle_t handle;

1880:   if (*cusparsestruct) {
1881:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1882:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1883:     delete (*cusparsestruct)->workVector;
1884:     delete (*cusparsestruct)->rowoffsets_gpu;
1885:     if (handle = (*cusparsestruct)->handle) {
1886:       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
1887:     }
1888:     delete *cusparsestruct;
1889:     *cusparsestruct = 0;
1890:   }
1891:   return(0);
1892: }

1894: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1895: {
1897:   if (*mat) {
1898:     delete (*mat)->values;
1899:     delete (*mat)->column_indices;
1900:     delete (*mat)->row_offsets;
1901:     delete *mat;
1902:     *mat = 0;
1903:   }
1904:   return(0);
1905: }

1907: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1908: {
1909:   cusparseStatus_t stat;
1910:   PetscErrorCode   ierr;

1913:   if (*trifactor) {
1914:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
1915:     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
1916:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
1917:     delete *trifactor;
1918:     *trifactor = 0;
1919:   }
1920:   return(0);
1921: }

1923: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1924: {
1925:   CsrMatrix        *mat;
1926:   cusparseStatus_t stat;
1927:   cudaError_t      err;

1930:   if (*matstruct) {
1931:     if ((*matstruct)->mat) {
1932:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1933:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1934:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
1935:       } else {
1936:         mat = (CsrMatrix*)(*matstruct)->mat;
1937:         CsrMatrix_Destroy(&mat);
1938:       }
1939:     }
1940:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
1941:     delete (*matstruct)->cprowIndices;
1942:     if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1943:     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
1944:     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
1945:     delete *matstruct;
1946:     *matstruct = 0;
1947:   }
1948:   return(0);
1949: }

1951: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1952: {
1953:   cusparseHandle_t handle;
1954:   cusparseStatus_t stat;

1957:   if (*trifactors) {
1958:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1959:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1960:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1961:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1962:     delete (*trifactors)->rpermIndices;
1963:     delete (*trifactors)->cpermIndices;
1964:     delete (*trifactors)->workVector;
1965:     if (handle = (*trifactors)->handle) {
1966:       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
1967:     }
1968:     delete *trifactors;
1969:     *trifactors = 0;
1970:   }
1971:   return(0);
1972: }