Actual source code: aijcusparse.cu
petsc-3.7.3 2016-08-01
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library,
4: */
5: #define PETSC_SKIP_SPINLOCK
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/vecimpl.h>
12: #undef VecType
13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
17: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
21: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
25: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
35: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37: static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39: static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
43: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
44: {
45: cusparseStatus_t stat;
46: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
49: cusparsestruct->stream = stream;
50: stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
51: return(0);
52: }
56: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
57: {
58: cusparseStatus_t stat;
59: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
62: if (cusparsestruct->handle)
63: stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
64: cusparsestruct->handle = handle;
65: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
66: return(0);
67: }
71: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
72: {
73: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
75: if (cusparsestruct->handle)
76: cusparsestruct->handle = 0;
77: return(0);
78: }
82: PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
83: {
85: *type = MATSOLVERCUSPARSE;
86: return(0);
87: }
89: /*MC
90: MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
91: on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
92: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
93: performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
94: CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
95: algorithms are not recommended. This class does NOT support direct solver operations.
97: Level: beginner
99: .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
100: M*/
104: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
105: {
107: PetscInt n = A->rmap->n;
110: MatCreate(PetscObjectComm((PetscObject)A),B);
111: (*B)->factortype = ftype;
112: MatSetSizes(*B,n,n,n,n);
113: MatSetType(*B,MATSEQAIJCUSPARSE);
115: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
116: MatSetBlockSizesFromMats(*B,A,A);
117: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
118: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
119: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
120: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
121: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
122: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
124: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
125: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);
126: return(0);
127: }
131: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
132: {
133: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
136: #if CUDA_VERSION>=4020
137: switch (op) {
138: case MAT_CUSPARSE_MULT:
139: cusparsestruct->format = format;
140: break;
141: case MAT_CUSPARSE_ALL:
142: cusparsestruct->format = format;
143: break;
144: default:
145: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
146: }
147: #else
148: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
149: #endif
150: return(0);
151: }
153: /*@
154: MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
155: operation. Only the MatMult operation can use different GPU storage formats
156: for MPIAIJCUSPARSE matrices.
157: Not Collective
159: Input Parameters:
160: + A - Matrix of type SEQAIJCUSPARSE
161: . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
162: - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
164: Output Parameter:
166: Level: intermediate
168: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
169: @*/
172: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
173: {
178: PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
179: return(0);
180: }
184: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
185: {
186: PetscErrorCode ierr;
187: MatCUSPARSEStorageFormat format;
188: PetscBool flg;
189: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
192: PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
193: PetscObjectOptionsBegin((PetscObject)A);
194: if (A->factortype==MAT_FACTOR_NONE) {
195: PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
196: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
197: if (flg) {
198: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
199: }
200: }
201: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
202: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
203: if (flg) {
204: MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
205: }
206: PetscOptionsEnd();
207: return(0);
209: }
213: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
214: {
218: MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
219: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
220: return(0);
221: }
225: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
226: {
230: MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
231: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
232: return(0);
233: }
237: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
238: {
242: MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
243: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
244: return(0);
245: }
249: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
250: {
254: MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
255: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
256: return(0);
257: }
261: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
262: {
263: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
264: PetscInt n = A->rmap->n;
265: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
266: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
267: cusparseStatus_t stat;
268: const PetscInt *ai = a->i,*aj = a->j,*vi;
269: const MatScalar *aa = a->a,*v;
270: PetscInt *AiLo, *AjLo;
271: PetscScalar *AALo;
272: PetscInt i,nz, nzLower, offset, rowOffset;
273: PetscErrorCode ierr;
276: if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
277: try {
278: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
279: nzLower=n+ai[n]-ai[1];
281: /* Allocate Space for the lower triangular matrix */
282: cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
283: cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
284: cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);
286: /* Fill the lower triangular matrix */
287: AiLo[0] = (PetscInt) 0;
288: AiLo[n] = nzLower;
289: AjLo[0] = (PetscInt) 0;
290: AALo[0] = (MatScalar) 1.0;
291: v = aa;
292: vi = aj;
293: offset = 1;
294: rowOffset= 1;
295: for (i=1; i<n; i++) {
296: nz = ai[i+1] - ai[i];
297: /* additional 1 for the term on the diagonal */
298: AiLo[i] = rowOffset;
299: rowOffset += nz+1;
301: PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));
302: PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));
304: offset += nz;
305: AjLo[offset] = (PetscInt) i;
306: AALo[offset] = (MatScalar) 1.0;
307: offset += 1;
309: v += nz;
310: vi += nz;
311: }
313: /* allocate space for the triangular factor information */
314: loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
316: /* Create the matrix description */
317: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
318: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
319: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
320: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
321: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
323: /* Create the solve analysis information */
324: stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
326: /* set the operation */
327: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
329: /* set the matrix */
330: loTriFactor->csrMat = new CsrMatrix;
331: loTriFactor->csrMat->num_rows = n;
332: loTriFactor->csrMat->num_cols = n;
333: loTriFactor->csrMat->num_entries = nzLower;
335: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
336: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
338: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
339: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
341: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
342: loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
344: /* perform the solve analysis */
345: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
346: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
347: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
348: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
350: /* assign the pointer. Is this really necessary? */
351: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
353: cudaFreeHost(AiLo);CHKERRCUDA(ierr);
354: cudaFreeHost(AjLo);CHKERRCUDA(ierr);
355: cudaFreeHost(AALo);CHKERRCUDA(ierr);
356: } catch(char *ex) {
357: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
358: }
359: }
360: return(0);
361: }
365: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
366: {
367: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
368: PetscInt n = A->rmap->n;
369: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
370: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
371: cusparseStatus_t stat;
372: const PetscInt *aj = a->j,*adiag = a->diag,*vi;
373: const MatScalar *aa = a->a,*v;
374: PetscInt *AiUp, *AjUp;
375: PetscScalar *AAUp;
376: PetscInt i,nz, nzUpper, offset;
377: PetscErrorCode ierr;
380: if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
381: try {
382: /* next, figure out the number of nonzeros in the upper triangular matrix. */
383: nzUpper = adiag[0]-adiag[n];
385: /* Allocate Space for the upper triangular matrix */
386: cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
387: cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
388: cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
390: /* Fill the upper triangular matrix */
391: AiUp[0]=(PetscInt) 0;
392: AiUp[n]=nzUpper;
393: offset = nzUpper;
394: for (i=n-1; i>=0; i--) {
395: v = aa + adiag[i+1] + 1;
396: vi = aj + adiag[i+1] + 1;
398: /* number of elements NOT on the diagonal */
399: nz = adiag[i] - adiag[i+1]-1;
401: /* decrement the offset */
402: offset -= (nz+1);
404: /* first, set the diagonal elements */
405: AjUp[offset] = (PetscInt) i;
406: AAUp[offset] = (MatScalar)1./v[nz];
407: AiUp[i] = AiUp[i+1] - (nz+1);
409: PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));
410: PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));
411: }
413: /* allocate space for the triangular factor information */
414: upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
416: /* Create the matrix description */
417: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
418: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
419: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
420: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
421: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
423: /* Create the solve analysis information */
424: stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
426: /* set the operation */
427: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
429: /* set the matrix */
430: upTriFactor->csrMat = new CsrMatrix;
431: upTriFactor->csrMat->num_rows = n;
432: upTriFactor->csrMat->num_cols = n;
433: upTriFactor->csrMat->num_entries = nzUpper;
435: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
436: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
438: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
439: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
441: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
442: upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
444: /* perform the solve analysis */
445: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
446: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
447: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
448: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
450: /* assign the pointer. Is this really necessary? */
451: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
453: cudaFreeHost(AiUp);CHKERRCUDA(ierr);
454: cudaFreeHost(AjUp);CHKERRCUDA(ierr);
455: cudaFreeHost(AAUp);CHKERRCUDA(ierr);
456: } catch(char *ex) {
457: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
458: }
459: }
460: return(0);
461: }
465: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
466: {
467: PetscErrorCode ierr;
468: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
469: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
470: IS isrow = a->row,iscol = a->icol;
471: PetscBool row_identity,col_identity;
472: const PetscInt *r,*c;
473: PetscInt n = A->rmap->n;
476: MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
477: MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);
479: cusparseTriFactors->workVector = new THRUSTARRAY;
480: cusparseTriFactors->workVector->resize(n);
481: cusparseTriFactors->nnz=a->nz;
483: A->valid_GPU_matrix = PETSC_CUDA_BOTH;
484: /*lower triangular indices */
485: ISGetIndices(isrow,&r);
486: ISIdentity(isrow,&row_identity);
487: if (!row_identity) {
488: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
489: cusparseTriFactors->rpermIndices->assign(r, r+n);
490: }
491: ISRestoreIndices(isrow,&r);
493: /*upper triangular indices */
494: ISGetIndices(iscol,&c);
495: ISIdentity(iscol,&col_identity);
496: if (!col_identity) {
497: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
498: cusparseTriFactors->cpermIndices->assign(c, c+n);
499: }
500: ISRestoreIndices(iscol,&c);
501: return(0);
502: }
506: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
507: {
508: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
509: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
510: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
511: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
512: cusparseStatus_t stat;
513: PetscErrorCode ierr;
514: PetscInt *AiUp, *AjUp;
515: PetscScalar *AAUp;
516: PetscScalar *AALo;
517: PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
518: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
519: const PetscInt *ai = b->i,*aj = b->j,*vj;
520: const MatScalar *aa = b->a,*v;
523: if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
524: try {
525: /* Allocate Space for the upper triangular matrix */
526: cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
527: cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
528: cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
529: cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
531: /* Fill the upper triangular matrix */
532: AiUp[0]=(PetscInt) 0;
533: AiUp[n]=nzUpper;
534: offset = 0;
535: for (i=0; i<n; i++) {
536: /* set the pointers */
537: v = aa + ai[i];
538: vj = aj + ai[i];
539: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
541: /* first, set the diagonal elements */
542: AjUp[offset] = (PetscInt) i;
543: AAUp[offset] = (MatScalar)1.0/v[nz];
544: AiUp[i] = offset;
545: AALo[offset] = (MatScalar)1.0/v[nz];
547: offset+=1;
548: if (nz>0) {
549: PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));
550: PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));
551: for (j=offset; j<offset+nz; j++) {
552: AAUp[j] = -AAUp[j];
553: AALo[j] = AAUp[j]/v[nz];
554: }
555: offset+=nz;
556: }
557: }
559: /* allocate space for the triangular factor information */
560: upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
562: /* Create the matrix description */
563: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
564: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
565: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
566: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
567: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
569: /* Create the solve analysis information */
570: stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
572: /* set the operation */
573: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
575: /* set the matrix */
576: upTriFactor->csrMat = new CsrMatrix;
577: upTriFactor->csrMat->num_rows = A->rmap->n;
578: upTriFactor->csrMat->num_cols = A->cmap->n;
579: upTriFactor->csrMat->num_entries = a->nz;
581: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
582: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
584: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
585: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
587: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
588: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
590: /* perform the solve analysis */
591: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
592: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
593: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
594: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
596: /* assign the pointer. Is this really necessary? */
597: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
599: /* allocate space for the triangular factor information */
600: loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
602: /* Create the matrix description */
603: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
604: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
605: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
606: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
607: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
609: /* Create the solve analysis information */
610: stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
612: /* set the operation */
613: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
615: /* set the matrix */
616: loTriFactor->csrMat = new CsrMatrix;
617: loTriFactor->csrMat->num_rows = A->rmap->n;
618: loTriFactor->csrMat->num_cols = A->cmap->n;
619: loTriFactor->csrMat->num_entries = a->nz;
621: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
622: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
624: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
625: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
627: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
628: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
630: /* perform the solve analysis */
631: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
632: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
633: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
634: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
636: /* assign the pointer. Is this really necessary? */
637: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
639: A->valid_GPU_matrix = PETSC_CUDA_BOTH;
640: cudaFreeHost(AiUp);CHKERRCUDA(ierr);
641: cudaFreeHost(AjUp);CHKERRCUDA(ierr);
642: cudaFreeHost(AAUp);CHKERRCUDA(ierr);
643: cudaFreeHost(AALo);CHKERRCUDA(ierr);
644: } catch(char *ex) {
645: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
646: }
647: }
648: return(0);
649: }
653: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
654: {
655: PetscErrorCode ierr;
656: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
657: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
658: IS ip = a->row;
659: const PetscInt *rip;
660: PetscBool perm_identity;
661: PetscInt n = A->rmap->n;
664: MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
665: cusparseTriFactors->workVector = new THRUSTARRAY;
666: cusparseTriFactors->workVector->resize(n);
667: cusparseTriFactors->nnz=(a->nz-n)*2 + n;
669: /*lower triangular indices */
670: ISGetIndices(ip,&rip);
671: ISIdentity(ip,&perm_identity);
672: if (!perm_identity) {
673: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
674: cusparseTriFactors->rpermIndices->assign(rip, rip+n);
675: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
676: cusparseTriFactors->cpermIndices->assign(rip, rip+n);
677: }
678: ISRestoreIndices(ip,&rip);
679: return(0);
680: }
684: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
685: {
686: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
687: IS isrow = b->row,iscol = b->col;
688: PetscBool row_identity,col_identity;
692: MatLUFactorNumeric_SeqAIJ(B,A,info);
693: /* determine which version of MatSolve needs to be used. */
694: ISIdentity(isrow,&row_identity);
695: ISIdentity(iscol,&col_identity);
696: if (row_identity && col_identity) {
697: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
698: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
699: } else {
700: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
701: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
702: }
704: /* get the triangular factors */
705: MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
706: return(0);
707: }
711: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
712: {
713: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
714: IS ip = b->row;
715: PetscBool perm_identity;
719: MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
721: /* determine which version of MatSolve needs to be used. */
722: ISIdentity(ip,&perm_identity);
723: if (perm_identity) {
724: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
725: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
726: } else {
727: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
728: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
729: }
731: /* get the triangular factors */
732: MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
733: return(0);
734: }
738: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
739: {
740: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
741: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
742: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
743: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
744: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
745: cusparseStatus_t stat;
746: cusparseIndexBase_t indexBase;
747: cusparseMatrixType_t matrixType;
748: cusparseFillMode_t fillMode;
749: cusparseDiagType_t diagType;
753: /*********************************************/
754: /* Now the Transpose of the Lower Tri Factor */
755: /*********************************************/
757: /* allocate space for the transpose of the lower triangular factor */
758: loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
760: /* set the matrix descriptors of the lower triangular factor */
761: matrixType = cusparseGetMatType(loTriFactor->descr);
762: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
763: fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
764: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
765: diagType = cusparseGetMatDiagType(loTriFactor->descr);
767: /* Create the matrix description */
768: stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
769: stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
770: stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
771: stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
772: stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);
774: /* Create the solve analysis information */
775: stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);
777: /* set the operation */
778: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
780: /* allocate GPU space for the CSC of the lower triangular factor*/
781: loTriFactorT->csrMat = new CsrMatrix;
782: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
783: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
784: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
785: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
786: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
787: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
789: /* compute the transpose of the lower triangular factor, i.e. the CSC */
790: stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
791: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
792: loTriFactor->csrMat->values->data().get(),
793: loTriFactor->csrMat->row_offsets->data().get(),
794: loTriFactor->csrMat->column_indices->data().get(),
795: loTriFactorT->csrMat->values->data().get(),
796: loTriFactorT->csrMat->column_indices->data().get(),
797: loTriFactorT->csrMat->row_offsets->data().get(),
798: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
800: /* perform the solve analysis on the transposed matrix */
801: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
802: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
803: loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
804: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
805: loTriFactorT->solveInfo);CHKERRCUDA(stat);
807: /* assign the pointer. Is this really necessary? */
808: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
810: /*********************************************/
811: /* Now the Transpose of the Upper Tri Factor */
812: /*********************************************/
814: /* allocate space for the transpose of the upper triangular factor */
815: upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
817: /* set the matrix descriptors of the upper triangular factor */
818: matrixType = cusparseGetMatType(upTriFactor->descr);
819: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
820: fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
821: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
822: diagType = cusparseGetMatDiagType(upTriFactor->descr);
824: /* Create the matrix description */
825: stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
826: stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
827: stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
828: stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
829: stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);
831: /* Create the solve analysis information */
832: stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);
834: /* set the operation */
835: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
837: /* allocate GPU space for the CSC of the upper triangular factor*/
838: upTriFactorT->csrMat = new CsrMatrix;
839: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
840: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
841: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
842: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
843: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
844: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
846: /* compute the transpose of the upper triangular factor, i.e. the CSC */
847: stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
848: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
849: upTriFactor->csrMat->values->data().get(),
850: upTriFactor->csrMat->row_offsets->data().get(),
851: upTriFactor->csrMat->column_indices->data().get(),
852: upTriFactorT->csrMat->values->data().get(),
853: upTriFactorT->csrMat->column_indices->data().get(),
854: upTriFactorT->csrMat->row_offsets->data().get(),
855: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
857: /* perform the solve analysis on the transposed matrix */
858: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
859: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
860: upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
861: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
862: upTriFactorT->solveInfo);CHKERRCUDA(stat);
864: /* assign the pointer. Is this really necessary? */
865: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
866: return(0);
867: }
871: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
872: {
873: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
874: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
875: Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
876: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
877: cusparseStatus_t stat;
878: cusparseIndexBase_t indexBase;
879: cudaError_t err;
883: /* allocate space for the triangular factor information */
884: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
885: stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
886: indexBase = cusparseGetMatIndexBase(matstruct->descr);
887: stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
888: stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
890: /* set alpha and beta */
891: err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
892: err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
893: err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
894: err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
895: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
897: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
898: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
899: CsrMatrix *matrixT= new CsrMatrix;
900: matrixT->num_rows = A->rmap->n;
901: matrixT->num_cols = A->cmap->n;
902: matrixT->num_entries = a->nz;
903: matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
904: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
905: matrixT->values = new THRUSTARRAY(a->nz);
907: /* compute the transpose of the upper triangular factor, i.e. the CSC */
908: indexBase = cusparseGetMatIndexBase(matstruct->descr);
909: stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
910: matrix->num_cols, matrix->num_entries,
911: matrix->values->data().get(),
912: matrix->row_offsets->data().get(),
913: matrix->column_indices->data().get(),
914: matrixT->values->data().get(),
915: matrixT->column_indices->data().get(),
916: matrixT->row_offsets->data().get(),
917: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
919: /* assign the pointer */
920: matstructT->mat = matrixT;
922: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
923: #if CUDA_VERSION>=5000
924: /* First convert HYB to CSR */
925: CsrMatrix *temp= new CsrMatrix;
926: temp->num_rows = A->rmap->n;
927: temp->num_cols = A->cmap->n;
928: temp->num_entries = a->nz;
929: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
930: temp->column_indices = new THRUSTINTARRAY32(a->nz);
931: temp->values = new THRUSTARRAY(a->nz);
934: stat = cusparse_hyb2csr(cusparsestruct->handle,
935: matstruct->descr, (cusparseHybMat_t)matstruct->mat,
936: temp->values->data().get(),
937: temp->row_offsets->data().get(),
938: temp->column_indices->data().get());CHKERRCUDA(stat);
940: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
941: CsrMatrix *tempT= new CsrMatrix;
942: tempT->num_rows = A->rmap->n;
943: tempT->num_cols = A->cmap->n;
944: tempT->num_entries = a->nz;
945: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
946: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
947: tempT->values = new THRUSTARRAY(a->nz);
949: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
950: temp->num_cols, temp->num_entries,
951: temp->values->data().get(),
952: temp->row_offsets->data().get(),
953: temp->column_indices->data().get(),
954: tempT->values->data().get(),
955: tempT->column_indices->data().get(),
956: tempT->row_offsets->data().get(),
957: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
959: /* Last, convert CSC to HYB */
960: cusparseHybMat_t hybMat;
961: stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
962: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
963: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
964: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
965: matstructT->descr, tempT->values->data().get(),
966: tempT->row_offsets->data().get(),
967: tempT->column_indices->data().get(),
968: hybMat, 0, partition);CHKERRCUDA(stat);
970: /* assign the pointer */
971: matstructT->mat = hybMat;
973: /* delete temporaries */
974: if (tempT) {
975: if (tempT->values) delete (THRUSTARRAY*) tempT->values;
976: if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
977: if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
978: delete (CsrMatrix*) tempT;
979: }
980: if (temp) {
981: if (temp->values) delete (THRUSTARRAY*) temp->values;
982: if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
983: if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
984: delete (CsrMatrix*) temp;
985: }
986: #else
987: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
988: #endif
989: }
990: /* assign the compressed row indices */
991: matstructT->cprowIndices = new THRUSTINTARRAY;
993: /* assign the pointer */
994: ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
995: return(0);
996: }
1000: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1001: {
1002: PetscInt n = xx->map->n;
1003: PetscScalar *xarray, *barray;
1004: thrust::device_ptr<PetscScalar> xGPU,bGPU;
1005: cusparseStatus_t stat;
1006: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1007: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1008: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1009: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1010: PetscErrorCode ierr;
1013: /* Analyze the matrix and create the transpose ... on the fly */
1014: if (!loTriFactorT && !upTriFactorT) {
1015: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1016: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1017: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1018: }
1020: /* Get the GPU pointers */
1021: VecCUDAGetArrayWrite(xx,&xarray);
1022: VecCUDAGetArrayRead(bb,&barray);
1023: xGPU = thrust::device_pointer_cast(xarray);
1024: bGPU = thrust::device_pointer_cast(barray);
1026: /* First, reorder with the row permutation */
1027: thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1028: thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1029: xGPU);
1031: /* First, solve U */
1032: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1033: upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1034: upTriFactorT->csrMat->values->data().get(),
1035: upTriFactorT->csrMat->row_offsets->data().get(),
1036: upTriFactorT->csrMat->column_indices->data().get(),
1037: upTriFactorT->solveInfo,
1038: xarray, tempGPU->data().get());CHKERRCUDA(stat);
1040: /* Then, solve L */
1041: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1042: loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1043: loTriFactorT->csrMat->values->data().get(),
1044: loTriFactorT->csrMat->row_offsets->data().get(),
1045: loTriFactorT->csrMat->column_indices->data().get(),
1046: loTriFactorT->solveInfo,
1047: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1049: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1050: thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1051: thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1052: tempGPU->begin());
1054: /* Copy the temporary to the full solution. */
1055: thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1057: /* restore */
1058: VecCUDARestoreArrayRead(bb,&barray);
1059: VecCUDARestoreArrayWrite(xx,&xarray);
1060: WaitForGPU();CHKERRCUDA(ierr);
1062: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1063: return(0);
1064: }
1068: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1069: {
1070: PetscScalar *xarray, *barray;
1071: cusparseStatus_t stat;
1072: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1073: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1074: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1075: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1076: PetscErrorCode ierr;
1079: /* Analyze the matrix and create the transpose ... on the fly */
1080: if (!loTriFactorT && !upTriFactorT) {
1081: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1082: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1083: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1084: }
1086: /* Get the GPU pointers */
1087: VecCUDAGetArrayWrite(xx,&xarray);
1088: VecCUDAGetArrayRead(bb,&barray);
1090: /* First, solve U */
1091: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1092: upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1093: upTriFactorT->csrMat->values->data().get(),
1094: upTriFactorT->csrMat->row_offsets->data().get(),
1095: upTriFactorT->csrMat->column_indices->data().get(),
1096: upTriFactorT->solveInfo,
1097: barray, tempGPU->data().get());CHKERRCUDA(stat);
1099: /* Then, solve L */
1100: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1101: loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1102: loTriFactorT->csrMat->values->data().get(),
1103: loTriFactorT->csrMat->row_offsets->data().get(),
1104: loTriFactorT->csrMat->column_indices->data().get(),
1105: loTriFactorT->solveInfo,
1106: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1108: /* restore */
1109: VecCUDARestoreArrayRead(bb,&barray);
1110: VecCUDARestoreArrayWrite(xx,&xarray);
1111: WaitForGPU();CHKERRCUDA(ierr);
1112: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1113: return(0);
1114: }
1118: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1119: {
1120: PetscScalar *xarray, *barray;
1121: thrust::device_ptr<PetscScalar> xGPU,bGPU;
1122: cusparseStatus_t stat;
1123: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1124: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1125: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1126: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1127: PetscErrorCode ierr;
1128: VecType t;
1129: PetscBool flg;
1132: VecGetType(bb,&t);
1133: PetscStrcmp(t,VECSEQCUDA,&flg);
1134: if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUDA);
1135: VecGetType(xx,&t);
1136: PetscStrcmp(t,VECSEQCUDA,&flg);
1137: if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUDA);
1139: /* Get the GPU pointers */
1140: VecCUDAGetArrayWrite(xx,&xarray);
1141: VecCUDAGetArrayRead(bb,&barray);
1142: xGPU = thrust::device_pointer_cast(xarray);
1143: bGPU = thrust::device_pointer_cast(barray);
1145: /* First, reorder with the row permutation */
1146: thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1147: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1148: xGPU);
1150: /* Next, solve L */
1151: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1152: loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1153: loTriFactor->csrMat->values->data().get(),
1154: loTriFactor->csrMat->row_offsets->data().get(),
1155: loTriFactor->csrMat->column_indices->data().get(),
1156: loTriFactor->solveInfo,
1157: xarray, tempGPU->data().get());CHKERRCUDA(stat);
1159: /* Then, solve U */
1160: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1161: upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1162: upTriFactor->csrMat->values->data().get(),
1163: upTriFactor->csrMat->row_offsets->data().get(),
1164: upTriFactor->csrMat->column_indices->data().get(),
1165: upTriFactor->solveInfo,
1166: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1168: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1169: thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1170: thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
1171: tempGPU->begin());
1173: /* Copy the temporary to the full solution. */
1174: thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1176: VecCUDARestoreArrayRead(bb,&barray);
1177: VecCUDARestoreArrayWrite(xx,&xarray);
1178: WaitForGPU();CHKERRCUDA(ierr);
1179: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1180: return(0);
1181: }
1185: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1186: {
1187: PetscScalar *xarray, *barray;
1188: cusparseStatus_t stat;
1189: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1190: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1191: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1192: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1193: PetscErrorCode ierr;
1196: /* Get the GPU pointers */
1197: VecCUDAGetArrayWrite(xx,&xarray);
1198: VecCUDAGetArrayRead(bb,&barray);
1200: /* First, solve L */
1201: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1202: loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1203: loTriFactor->csrMat->values->data().get(),
1204: loTriFactor->csrMat->row_offsets->data().get(),
1205: loTriFactor->csrMat->column_indices->data().get(),
1206: loTriFactor->solveInfo,
1207: barray, tempGPU->data().get());CHKERRCUDA(stat);
1209: /* Next, solve U */
1210: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1211: upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1212: upTriFactor->csrMat->values->data().get(),
1213: upTriFactor->csrMat->row_offsets->data().get(),
1214: upTriFactor->csrMat->column_indices->data().get(),
1215: upTriFactor->solveInfo,
1216: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1218: VecCUDARestoreArrayRead(bb,&barray);
1219: VecCUDARestoreArrayWrite(xx,&xarray);
1220: WaitForGPU();CHKERRCUDA(ierr);
1221: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1222: return(0);
1223: }
1227: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1228: {
1230: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1231: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1232: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1233: PetscInt m = A->rmap->n,*ii,*ridx;
1234: PetscErrorCode ierr;
1235: cusparseStatus_t stat;
1236: cudaError_t err;
1239: if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
1240: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1241: Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1242: try {
1243: cusparsestruct->nonzerorow=0;
1244: for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1246: if (a->compressedrow.use) {
1247: m = a->compressedrow.nrows;
1248: ii = a->compressedrow.i;
1249: ridx = a->compressedrow.rindex;
1250: } else {
1251: /* Forcing compressed row on the GPU */
1252: int k=0;
1253: PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1254: PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1255: ii[0]=0;
1256: for (int j = 0; j<m; j++) {
1257: if ((a->i[j+1]-a->i[j])>0) {
1258: ii[k] = a->i[j];
1259: ridx[k]= j;
1260: k++;
1261: }
1262: }
1263: ii[cusparsestruct->nonzerorow] = a->nz;
1264: m = cusparsestruct->nonzerorow;
1265: }
1267: /* allocate space for the triangular factor information */
1268: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1269: stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1270: stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1271: stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1273: err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
1274: err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1275: err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
1276: err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1277: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1279: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1280: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1281: /* set the matrix */
1282: CsrMatrix *matrix= new CsrMatrix;
1283: matrix->num_rows = m;
1284: matrix->num_cols = A->cmap->n;
1285: matrix->num_entries = a->nz;
1286: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1287: matrix->row_offsets->assign(ii, ii + m+1);
1289: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1290: matrix->column_indices->assign(a->j, a->j+a->nz);
1292: matrix->values = new THRUSTARRAY(a->nz);
1293: matrix->values->assign(a->a, a->a+a->nz);
1295: /* assign the pointer */
1296: matstruct->mat = matrix;
1298: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1299: #if CUDA_VERSION>=4020
1300: CsrMatrix *matrix= new CsrMatrix;
1301: matrix->num_rows = m;
1302: matrix->num_cols = A->cmap->n;
1303: matrix->num_entries = a->nz;
1304: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1305: matrix->row_offsets->assign(ii, ii + m+1);
1307: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1308: matrix->column_indices->assign(a->j, a->j+a->nz);
1310: matrix->values = new THRUSTARRAY(a->nz);
1311: matrix->values->assign(a->a, a->a+a->nz);
1313: cusparseHybMat_t hybMat;
1314: stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1315: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1316: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1317: stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1318: matstruct->descr, matrix->values->data().get(),
1319: matrix->row_offsets->data().get(),
1320: matrix->column_indices->data().get(),
1321: hybMat, 0, partition);CHKERRCUDA(stat);
1322: /* assign the pointer */
1323: matstruct->mat = hybMat;
1325: if (matrix) {
1326: if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1327: if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1328: if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1329: delete (CsrMatrix*)matrix;
1330: }
1331: #endif
1332: }
1334: /* assign the compressed row indices */
1335: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1336: matstruct->cprowIndices->assign(ridx,ridx+m);
1338: /* assign the pointer */
1339: cusparsestruct->mat = matstruct;
1341: if (!a->compressedrow.use) {
1342: PetscFree(ii);
1343: PetscFree(ridx);
1344: }
1345: cusparsestruct->workVector = new THRUSTARRAY;
1346: cusparsestruct->workVector->resize(m);
1347: } catch(char *ex) {
1348: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1349: }
1350: WaitForGPU();CHKERRCUDA(ierr);
1352: A->valid_GPU_matrix = PETSC_CUDA_BOTH;
1354: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1355: }
1356: return(0);
1357: }
1361: static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
1362: {
1364: PetscInt rbs,cbs;
1367: MatGetBlockSizes(mat,&rbs,&cbs);
1368: if (right) {
1369: VecCreate(PetscObjectComm((PetscObject)mat),right);
1370: VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
1371: VecSetBlockSize(*right,cbs);
1372: VecSetType(*right,VECSEQCUDA);
1373: PetscLayoutReference(mat->cmap,&(*right)->map);
1374: }
1375: if (left) {
1376: VecCreate(PetscObjectComm((PetscObject)mat),left);
1377: VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
1378: VecSetBlockSize(*left,rbs);
1379: VecSetType(*left,VECSEQCUDA);
1380: PetscLayoutReference(mat->rmap,&(*left)->map);
1381: }
1382: return(0);
1383: }
1385: struct VecCUDAPlusEquals
1386: {
1387: template <typename Tuple>
1388: __host__ __device__
1389: void operator()(Tuple t)
1390: {
1391: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1392: }
1393: };
1397: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1398: {
1399: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1400: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1401: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1402: PetscScalar *xarray,*yarray;
1403: PetscErrorCode ierr;
1404: cusparseStatus_t stat;
1407: /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1408: MatSeqAIJCUSPARSECopyToGPU(A); */
1409: VecCUDAGetArrayRead(xx,&xarray);
1410: VecCUDAGetArrayWrite(yy,&yarray);
1411: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1412: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1413: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1414: mat->num_rows, mat->num_cols, mat->num_entries,
1415: matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
1416: mat->column_indices->data().get(), xarray, matstruct->beta,
1417: yarray);CHKERRCUDA(stat);
1418: } else {
1419: #if CUDA_VERSION>=4020
1420: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1421: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1422: matstruct->alpha, matstruct->descr, hybMat,
1423: xarray, matstruct->beta,
1424: yarray);CHKERRCUDA(stat);
1425: #endif
1426: }
1427: VecCUDARestoreArrayRead(xx,&xarray);
1428: VecCUDARestoreArrayWrite(yy,&yarray);
1429: if (!cusparsestruct->stream) {
1430: WaitForGPU();CHKERRCUDA(ierr);
1431: }
1432: PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1433: return(0);
1434: }
1438: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1439: {
1440: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1441: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1442: Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1443: PetscScalar *xarray,*yarray;
1444: PetscErrorCode ierr;
1445: cusparseStatus_t stat;
1448: /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1449: MatSeqAIJCUSPARSECopyToGPU(A); */
1450: if (!matstructT) {
1451: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1452: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1453: }
1454: VecCUDAGetArrayRead(xx,&xarray);
1455: VecCUDAGetArrayWrite(yy,&yarray);
1457: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1458: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1459: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1460: mat->num_rows, mat->num_cols,
1461: mat->num_entries, matstructT->alpha, matstructT->descr,
1462: mat->values->data().get(), mat->row_offsets->data().get(),
1463: mat->column_indices->data().get(), xarray, matstructT->beta,
1464: yarray);CHKERRCUDA(stat);
1465: } else {
1466: #if CUDA_VERSION>=4020
1467: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1468: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1469: matstructT->alpha, matstructT->descr, hybMat,
1470: xarray, matstructT->beta,
1471: yarray);CHKERRCUDA(stat);
1472: #endif
1473: }
1474: VecCUDARestoreArrayRead(xx,&xarray);
1475: VecCUDARestoreArrayWrite(yy,&yarray);
1476: if (!cusparsestruct->stream) {
1477: WaitForGPU();CHKERRCUDA(ierr);
1478: }
1479: PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1480: return(0);
1481: }
1486: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1487: {
1488: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1489: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1490: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1491: thrust::device_ptr<PetscScalar> zptr;
1492: PetscScalar *xarray,*zarray;
1493: PetscErrorCode ierr;
1494: cusparseStatus_t stat;
1497: /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1498: MatSeqAIJCUSPARSECopyToGPU(A); */
1499: try {
1500: VecCopy_SeqCUDA(yy,zz);
1501: VecCUDAGetArrayRead(xx,&xarray);
1502: VecCUDAGetArrayWrite(zz,&zarray);
1503: zptr = thrust::device_pointer_cast(zarray);
1505: /* multiply add */
1506: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1507: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1508: /* here we need to be careful to set the number of rows in the multiply to the
1509: number of compressed rows in the matrix ... which is equivalent to the
1510: size of the workVector */
1511: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1512: mat->num_rows, mat->num_cols,
1513: mat->num_entries, matstruct->alpha, matstruct->descr,
1514: mat->values->data().get(), mat->row_offsets->data().get(),
1515: mat->column_indices->data().get(), xarray, matstruct->beta,
1516: cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1517: } else {
1518: #if CUDA_VERSION>=4020
1519: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1520: if (cusparsestruct->workVector->size()) {
1521: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1522: matstruct->alpha, matstruct->descr, hybMat,
1523: xarray, matstruct->beta,
1524: cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1525: }
1526: #endif
1527: }
1529: /* scatter the data from the temporary into the full vector with a += operation */
1530: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1531: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1532: VecCUDAPlusEquals());
1533: VecCUDARestoreArrayRead(xx,&xarray);
1534: VecCUDARestoreArrayWrite(zz,&zarray);
1536: } catch(char *ex) {
1537: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1538: }
1539: WaitForGPU();CHKERRCUDA(ierr);
1540: PetscLogFlops(2.0*a->nz);
1541: return(0);
1542: }
1546: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1547: {
1548: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1549: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1550: Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1551: thrust::device_ptr<PetscScalar> zptr;
1552: PetscScalar *xarray,*zarray;
1553: PetscErrorCode ierr;
1554: cusparseStatus_t stat;
1557: /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1558: MatSeqAIJCUSPARSECopyToGPU(A); */
1559: if (!matstructT) {
1560: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1561: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1562: }
1564: try {
1565: VecCopy_SeqCUDA(yy,zz);
1566: VecCUDAGetArrayRead(xx,&xarray);
1567: VecCUDAGetArrayWrite(zz,&zarray);
1568: zptr = thrust::device_pointer_cast(zarray);
1570: /* multiply add with matrix transpose */
1571: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1572: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1573: /* here we need to be careful to set the number of rows in the multiply to the
1574: number of compressed rows in the matrix ... which is equivalent to the
1575: size of the workVector */
1576: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1577: mat->num_rows, mat->num_cols,
1578: mat->num_entries, matstructT->alpha, matstructT->descr,
1579: mat->values->data().get(), mat->row_offsets->data().get(),
1580: mat->column_indices->data().get(), xarray, matstructT->beta,
1581: cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1582: } else {
1583: #if CUDA_VERSION>=4020
1584: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1585: if (cusparsestruct->workVector->size()) {
1586: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1587: matstructT->alpha, matstructT->descr, hybMat,
1588: xarray, matstructT->beta,
1589: cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1590: }
1591: #endif
1592: }
1594: /* scatter the data from the temporary into the full vector with a += operation */
1595: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
1596: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1597: VecCUDAPlusEquals());
1599: VecCUDARestoreArrayRead(xx,&xarray);
1600: VecCUDARestoreArrayWrite(zz,&zarray);
1602: } catch(char *ex) {
1603: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1604: }
1605: WaitForGPU();CHKERRCUDA(ierr);
1606: PetscLogFlops(2.0*a->nz);
1607: return(0);
1608: }
1612: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1613: {
1617: MatAssemblyEnd_SeqAIJ(A,mode);
1618: if (A->factortype==MAT_FACTOR_NONE) {
1619: MatSeqAIJCUSPARSECopyToGPU(A);
1620: }
1621: if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1622: A->ops->mult = MatMult_SeqAIJCUSPARSE;
1623: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1624: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1625: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1626: return(0);
1627: }
1629: /* --------------------------------------------------------------------------------*/
1632: /*@
1633: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1634: (the default parallel PETSc format). This matrix will ultimately pushed down
1635: to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1636: assembly performance the user should preallocate the matrix storage by setting
1637: the parameter nz (or the array nnz). By setting these parameters accurately,
1638: performance during matrix assembly can be increased by more than a factor of 50.
1640: Collective on MPI_Comm
1642: Input Parameters:
1643: + comm - MPI communicator, set to PETSC_COMM_SELF
1644: . m - number of rows
1645: . n - number of columns
1646: . nz - number of nonzeros per row (same for all rows)
1647: - nnz - array containing the number of nonzeros in the various rows
1648: (possibly different for each row) or NULL
1650: Output Parameter:
1651: . A - the matrix
1653: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1654: MatXXXXSetPreallocation() paradgm instead of this routine directly.
1655: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1657: Notes:
1658: If nnz is given then nz is ignored
1660: The AIJ format (also called the Yale sparse matrix format or
1661: compressed row storage), is fully compatible with standard Fortran 77
1662: storage. That is, the stored row and column indices can begin at
1663: either one (as in Fortran) or zero. See the users' manual for details.
1665: Specify the preallocated storage with either nz or nnz (not both).
1666: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1667: allocation. For large problems you MUST preallocate memory or you
1668: will get TERRIBLE performance, see the users' manual chapter on matrices.
1670: By default, this format uses inodes (identical nodes) when possible, to
1671: improve numerical efficiency of matrix-vector products and solves. We
1672: search for consecutive rows with the same nonzero structure, thereby
1673: reusing matrix information to achieve increased efficiency.
1675: Level: intermediate
1677: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1678: @*/
1679: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1680: {
1684: MatCreate(comm,A);
1685: MatSetSizes(*A,m,n,m,n);
1686: MatSetType(*A,MATSEQAIJCUSPARSE);
1687: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1688: return(0);
1689: }
1693: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1694: {
1695: PetscErrorCode ierr;
1698: if (A->factortype==MAT_FACTOR_NONE) {
1699: if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) {
1700: Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1701: }
1702: } else {
1703: Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1704: }
1705: MatDestroy_SeqAIJ(A);
1706: return(0);
1707: }
1711: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1712: {
1714: cusparseStatus_t stat;
1715: cusparseHandle_t handle=0;
1718: MatCreate_SeqAIJ(B);
1719: if (B->factortype==MAT_FACTOR_NONE) {
1720: /* you cannot check the inode.use flag here since the matrix was just created.
1721: now build a GPU matrix data structure */
1722: B->spptr = new Mat_SeqAIJCUSPARSE;
1723: ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0;
1724: ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1725: ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0;
1726: ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR;
1727: ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0;
1728: ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0;
1729: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1730: ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle;
1731: ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0;
1732: } else {
1733: /* NEXT, set the pointers to the triangular factors */
1734: B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1735: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
1736: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
1737: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1738: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1739: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0;
1740: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0;
1741: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0;
1742: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0;
1743: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1744: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle;
1745: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0;
1746: }
1748: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
1749: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
1750: B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE;
1751: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
1752: B->ops->mult = MatMult_SeqAIJCUSPARSE;
1753: B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1754: B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1755: B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1757: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
1759: B->valid_GPU_matrix = PETSC_CUDA_UNALLOCATED;
1761: PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1762: return(0);
1763: }
1765: /*M
1766: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1768: A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1769: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1770: All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1772: Options Database Keys:
1773: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1774: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1775: . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1777: Level: beginner
1779: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1780: M*/
1782: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1787: PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void)
1788: {
1792: MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
1793: MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
1794: MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
1795: MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
1796: return(0);
1797: }
1802: static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1803: {
1804: cusparseStatus_t stat;
1805: cusparseHandle_t handle;
1808: if (*cusparsestruct) {
1809: Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1810: Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1811: delete (*cusparsestruct)->workVector;
1812: if (handle = (*cusparsestruct)->handle) {
1813: stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1814: }
1815: delete *cusparsestruct;
1816: *cusparsestruct = 0;
1817: }
1818: return(0);
1819: }
1823: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1824: {
1826: if (*mat) {
1827: delete (*mat)->values;
1828: delete (*mat)->column_indices;
1829: delete (*mat)->row_offsets;
1830: delete *mat;
1831: *mat = 0;
1832: }
1833: return(0);
1834: }
1838: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1839: {
1840: cusparseStatus_t stat;
1841: PetscErrorCode ierr;
1844: if (*trifactor) {
1845: if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1846: if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1847: CsrMatrix_Destroy(&(*trifactor)->csrMat);
1848: delete *trifactor;
1849: *trifactor = 0;
1850: }
1851: return(0);
1852: }
1856: static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1857: {
1858: CsrMatrix *mat;
1859: cusparseStatus_t stat;
1860: cudaError_t err;
1863: if (*matstruct) {
1864: if ((*matstruct)->mat) {
1865: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1866: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1867: stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1868: } else {
1869: mat = (CsrMatrix*)(*matstruct)->mat;
1870: CsrMatrix_Destroy(&mat);
1871: }
1872: }
1873: if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1874: delete (*matstruct)->cprowIndices;
1875: if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1876: if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
1877: delete *matstruct;
1878: *matstruct = 0;
1879: }
1880: return(0);
1881: }
1885: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1886: {
1887: cusparseHandle_t handle;
1888: cusparseStatus_t stat;
1891: if (*trifactors) {
1892: Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1893: Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1894: Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1895: Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1896: delete (*trifactors)->rpermIndices;
1897: delete (*trifactors)->cpermIndices;
1898: delete (*trifactors)->workVector;
1899: if (handle = (*trifactors)->handle) {
1900: stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1901: }
1902: delete *trifactors;
1903: *trifactors = 0;
1904: }
1905: return(0);
1906: }