Actual source code: aijcusparse.cu
petsc-3.6.1 2015-08-06
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library,
4: */
6: #include <petscconf.h>
7: #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8: #include <../src/mat/impls/sbaij/seq/sbaij.h>
9: #include <../src/vec/vec/impls/dvecimpl.h>
10: #include <petsc/private/vecimpl.h>
11: #undef VecType
12: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
17: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
21: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
25: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
26: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
27: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
28: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat);
29: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
30: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
31: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
32: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
35: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
36: static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
37: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
38: static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
42: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
43: {
44: cusparseStatus_t stat;
45: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
48: cusparsestruct->stream = stream;
49: stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat);
50: return(0);
51: }
55: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
56: {
57: cusparseStatus_t stat;
58: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
61: if (cusparsestruct->handle)
62: stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
63: cusparsestruct->handle = handle;
64: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
65: return(0);
66: }
70: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
71: {
72: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
74: if (cusparsestruct->handle)
75: cusparsestruct->handle = 0;
76: return(0);
77: }
81: PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
82: {
84: *type = MATSOLVERCUSPARSE;
85: return(0);
86: }
88: /*MC
89: MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
90: on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
91: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
92: performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
93: CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
94: algorithms are not recommended. This class does NOT support direct solver operations.
96: Level: beginner
98: .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
99: M*/
103: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
104: {
106: PetscInt n = A->rmap->n;
109: MatCreate(PetscObjectComm((PetscObject)A),B);
110: (*B)->factortype = ftype;
111: MatSetSizes(*B,n,n,n,n);
112: MatSetType(*B,MATSEQAIJCUSPARSE);
114: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
115: MatSetBlockSizesFromMats(*B,A,A);
116: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
117: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
118: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
119: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
120: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
121: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
123: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
124: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);
125: return(0);
126: }
130: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
131: {
132: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
135: #if CUDA_VERSION>=4020
136: switch (op) {
137: case MAT_CUSPARSE_MULT:
138: cusparsestruct->format = format;
139: break;
140: case MAT_CUSPARSE_ALL:
141: cusparsestruct->format = format;
142: break;
143: default:
144: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
145: }
146: #else
147: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
148: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
149: #endif
150: return(0);
151: }
153: /*@
154: MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
155: operation. Only the MatMult operation can use different GPU storage formats
156: for MPIAIJCUSPARSE matrices.
157: Not Collective
159: Input Parameters:
160: + A - Matrix of type SEQAIJCUSPARSE
161: . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
162: - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
164: Output Parameter:
166: Level: intermediate
168: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
169: @*/
172: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
173: {
178: PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
179: return(0);
180: }
184: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat A)
185: {
186: PetscErrorCode ierr;
187: MatCUSPARSEStorageFormat format;
188: PetscBool flg;
189: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
192: PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
193: PetscObjectOptionsBegin((PetscObject)A);
194: if (A->factortype==MAT_FACTOR_NONE) {
195: PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
196: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
197: if (flg) {
198: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
199: }
200: }
201: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
202: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
203: if (flg) {
204: MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
205: }
206: PetscOptionsEnd();
207: return(0);
209: }
213: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
214: {
218: MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
219: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
220: return(0);
221: }
225: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
226: {
230: MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
231: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
232: return(0);
233: }
237: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
238: {
242: MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
243: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
244: return(0);
245: }
249: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
250: {
254: MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
255: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
256: return(0);
257: }
261: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
262: {
263: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
264: PetscInt n = A->rmap->n;
265: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
266: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
267: cusparseStatus_t stat;
268: const PetscInt *ai = a->i,*aj = a->j,*vi;
269: const MatScalar *aa = a->a,*v;
270: PetscInt *AiLo, *AjLo;
271: PetscScalar *AALo;
272: PetscInt i,nz, nzLower, offset, rowOffset;
273: PetscErrorCode ierr;
276: if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
277: try {
278: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
279: nzLower=n+ai[n]-ai[1];
281: /* Allocate Space for the lower triangular matrix */
282: cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
283: cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
284: cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);
286: /* Fill the lower triangular matrix */
287: AiLo[0] = (PetscInt) 0;
288: AiLo[n] = nzLower;
289: AjLo[0] = (PetscInt) 0;
290: AALo[0] = (MatScalar) 1.0;
291: v = aa;
292: vi = aj;
293: offset = 1;
294: rowOffset= 1;
295: for (i=1; i<n; i++) {
296: nz = ai[i+1] - ai[i];
297: /* additional 1 for the term on the diagonal */
298: AiLo[i] = rowOffset;
299: rowOffset += nz+1;
301: PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));
302: PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));
304: offset += nz;
305: AjLo[offset] = (PetscInt) i;
306: AALo[offset] = (MatScalar) 1.0;
307: offset += 1;
309: v += nz;
310: vi += nz;
311: }
313: /* allocate space for the triangular factor information */
314: loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
316: /* Create the matrix description */
317: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
318: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
319: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
320: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
321: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
323: /* Create the solve analysis information */
324: stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);
326: /* set the operation */
327: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
329: /* set the matrix */
330: loTriFactor->csrMat = new CsrMatrix;
331: loTriFactor->csrMat->num_rows = n;
332: loTriFactor->csrMat->num_cols = n;
333: loTriFactor->csrMat->num_entries = nzLower;
335: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
336: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
338: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
339: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
341: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
342: loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
344: /* perform the solve analysis */
345: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
346: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
347: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
348: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);
350: /* assign the pointer. Is this really necessary? */
351: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
353: cudaFreeHost(AiLo);CHKERRCUSP(ierr);
354: cudaFreeHost(AjLo);CHKERRCUSP(ierr);
355: cudaFreeHost(AALo);CHKERRCUSP(ierr);
356: } catch(char *ex) {
357: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
358: }
359: }
360: return(0);
361: }
365: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
366: {
367: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
368: PetscInt n = A->rmap->n;
369: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
370: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
371: cusparseStatus_t stat;
372: const PetscInt *aj = a->j,*adiag = a->diag,*vi;
373: const MatScalar *aa = a->a,*v;
374: PetscInt *AiUp, *AjUp;
375: PetscScalar *AAUp;
376: PetscInt i,nz, nzUpper, offset;
377: PetscErrorCode ierr;
380: if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
381: try {
382: /* next, figure out the number of nonzeros in the upper triangular matrix. */
383: nzUpper = adiag[0]-adiag[n];
385: /* Allocate Space for the upper triangular matrix */
386: cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
387: cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
388: cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
390: /* Fill the upper triangular matrix */
391: AiUp[0]=(PetscInt) 0;
392: AiUp[n]=nzUpper;
393: offset = nzUpper;
394: for (i=n-1; i>=0; i--) {
395: v = aa + adiag[i+1] + 1;
396: vi = aj + adiag[i+1] + 1;
398: /* number of elements NOT on the diagonal */
399: nz = adiag[i] - adiag[i+1]-1;
401: /* decrement the offset */
402: offset -= (nz+1);
404: /* first, set the diagonal elements */
405: AjUp[offset] = (PetscInt) i;
406: AAUp[offset] = 1./v[nz];
407: AiUp[i] = AiUp[i+1] - (nz+1);
409: PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));
410: PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));
411: }
413: /* allocate space for the triangular factor information */
414: upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
416: /* Create the matrix description */
417: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
418: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
419: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
420: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
421: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
423: /* Create the solve analysis information */
424: stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);
426: /* set the operation */
427: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
429: /* set the matrix */
430: upTriFactor->csrMat = new CsrMatrix;
431: upTriFactor->csrMat->num_rows = n;
432: upTriFactor->csrMat->num_cols = n;
433: upTriFactor->csrMat->num_entries = nzUpper;
435: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
436: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
438: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
439: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
441: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
442: upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
444: /* perform the solve analysis */
445: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
446: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
447: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
448: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);
450: /* assign the pointer. Is this really necessary? */
451: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
453: cudaFreeHost(AiUp);CHKERRCUSP(ierr);
454: cudaFreeHost(AjUp);CHKERRCUSP(ierr);
455: cudaFreeHost(AAUp);CHKERRCUSP(ierr);
456: } catch(char *ex) {
457: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
458: }
459: }
460: return(0);
461: }
465: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
466: {
467: PetscErrorCode ierr;
468: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
469: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
470: IS isrow = a->row,iscol = a->icol;
471: PetscBool row_identity,col_identity;
472: const PetscInt *r,*c;
473: PetscInt n = A->rmap->n;
476: MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
477: MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);
479: cusparseTriFactors->workVector = new THRUSTARRAY;
480: cusparseTriFactors->workVector->resize(n);
481: cusparseTriFactors->nnz=a->nz;
483: A->valid_GPU_matrix = PETSC_CUSP_BOTH;
484: /*lower triangular indices */
485: ISGetIndices(isrow,&r);
486: ISIdentity(isrow,&row_identity);
487: if (!row_identity) {
488: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
489: cusparseTriFactors->rpermIndices->assign(r, r+n);
490: }
491: ISRestoreIndices(isrow,&r);
493: /*upper triangular indices */
494: ISGetIndices(iscol,&c);
495: ISIdentity(iscol,&col_identity);
496: if (!col_identity) {
497: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
498: cusparseTriFactors->cpermIndices->assign(c, c+n);
499: }
500: ISRestoreIndices(iscol,&c);
501: return(0);
502: }
506: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
507: {
508: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
509: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
510: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
511: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
512: cusparseStatus_t stat;
513: PetscErrorCode ierr;
514: PetscInt *AiUp, *AjUp;
515: PetscScalar *AAUp;
516: PetscScalar *AALo;
517: PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
518: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
519: const PetscInt *ai = b->i,*aj = b->j,*vj;
520: const MatScalar *aa = b->a,*v;
523: if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
524: try {
525: /* Allocate Space for the upper triangular matrix */
526: cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
527: cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
528: cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
529: cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
531: /* Fill the upper triangular matrix */
532: AiUp[0]=(PetscInt) 0;
533: AiUp[n]=nzUpper;
534: offset = 0;
535: for (i=0; i<n; i++) {
536: /* set the pointers */
537: v = aa + ai[i];
538: vj = aj + ai[i];
539: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
541: /* first, set the diagonal elements */
542: AjUp[offset] = (PetscInt) i;
543: AAUp[offset] = 1.0/v[nz];
544: AiUp[i] = offset;
545: AALo[offset] = 1.0/v[nz];
547: offset+=1;
548: if (nz>0) {
549: PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));
550: PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));
551: for (j=offset; j<offset+nz; j++) {
552: AAUp[j] = -AAUp[j];
553: AALo[j] = AAUp[j]/v[nz];
554: }
555: offset+=nz;
556: }
557: }
559: /* allocate space for the triangular factor information */
560: upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
562: /* Create the matrix description */
563: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
564: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
565: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
566: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
567: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
569: /* Create the solve analysis information */
570: stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);
572: /* set the operation */
573: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
575: /* set the matrix */
576: upTriFactor->csrMat = new CsrMatrix;
577: upTriFactor->csrMat->num_rows = A->rmap->n;
578: upTriFactor->csrMat->num_cols = A->cmap->n;
579: upTriFactor->csrMat->num_entries = a->nz;
581: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
582: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
584: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
585: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
587: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
588: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
590: /* perform the solve analysis */
591: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
592: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
593: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
594: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);
596: /* assign the pointer. Is this really necessary? */
597: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
599: /* allocate space for the triangular factor information */
600: loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
602: /* Create the matrix description */
603: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
604: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
605: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
606: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
607: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
609: /* Create the solve analysis information */
610: stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);
612: /* set the operation */
613: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
615: /* set the matrix */
616: loTriFactor->csrMat = new CsrMatrix;
617: loTriFactor->csrMat->num_rows = A->rmap->n;
618: loTriFactor->csrMat->num_cols = A->cmap->n;
619: loTriFactor->csrMat->num_entries = a->nz;
621: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
622: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
624: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
625: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
627: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
628: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
630: /* perform the solve analysis */
631: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
632: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
633: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
634: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);
636: /* assign the pointer. Is this really necessary? */
637: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
639: A->valid_GPU_matrix = PETSC_CUSP_BOTH;
640: cudaFreeHost(AiUp);CHKERRCUSP(ierr);
641: cudaFreeHost(AjUp);CHKERRCUSP(ierr);
642: cudaFreeHost(AAUp);CHKERRCUSP(ierr);
643: cudaFreeHost(AALo);CHKERRCUSP(ierr);
644: } catch(char *ex) {
645: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
646: }
647: }
648: return(0);
649: }
653: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
654: {
655: PetscErrorCode ierr;
656: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
657: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
658: IS ip = a->row;
659: const PetscInt *rip;
660: PetscBool perm_identity;
661: PetscInt n = A->rmap->n;
664: MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
665: cusparseTriFactors->workVector = new THRUSTARRAY;
666: cusparseTriFactors->workVector->resize(n);
667: cusparseTriFactors->nnz=(a->nz-n)*2 + n;
669: /*lower triangular indices */
670: ISGetIndices(ip,&rip);
671: ISIdentity(ip,&perm_identity);
672: if (!perm_identity) {
673: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
674: cusparseTriFactors->rpermIndices->assign(rip, rip+n);
675: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
676: cusparseTriFactors->cpermIndices->assign(rip, rip+n);
677: }
678: ISRestoreIndices(ip,&rip);
679: return(0);
680: }
684: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
685: {
686: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
687: IS isrow = b->row,iscol = b->col;
688: PetscBool row_identity,col_identity;
692: MatLUFactorNumeric_SeqAIJ(B,A,info);
693: /* determine which version of MatSolve needs to be used. */
694: ISIdentity(isrow,&row_identity);
695: ISIdentity(iscol,&col_identity);
696: if (row_identity && col_identity) {
697: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
698: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
699: } else {
700: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
701: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
702: }
704: /* get the triangular factors */
705: MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
706: return(0);
707: }
711: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
712: {
713: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
714: IS ip = b->row;
715: PetscBool perm_identity;
719: MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
721: /* determine which version of MatSolve needs to be used. */
722: ISIdentity(ip,&perm_identity);
723: if (perm_identity) {
724: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
725: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
726: } else {
727: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
728: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
729: }
731: /* get the triangular factors */
732: MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
733: return(0);
734: }
738: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
739: {
740: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
741: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
742: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
743: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
744: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
745: cusparseStatus_t stat;
746: cusparseIndexBase_t indexBase;
747: cusparseMatrixType_t matrixType;
748: cusparseFillMode_t fillMode;
749: cusparseDiagType_t diagType;
753: /*********************************************/
754: /* Now the Transpose of the Lower Tri Factor */
755: /*********************************************/
757: /* allocate space for the transpose of the lower triangular factor */
758: loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
760: /* set the matrix descriptors of the lower triangular factor */
761: matrixType = cusparseGetMatType(loTriFactor->descr);
762: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
763: fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
764: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
765: diagType = cusparseGetMatDiagType(loTriFactor->descr);
767: /* Create the matrix description */
768: stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
769: stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
770: stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
771: stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
772: stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);
774: /* Create the solve analysis information */
775: stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);
777: /* set the operation */
778: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
780: /* allocate GPU space for the CSC of the lower triangular factor*/
781: loTriFactorT->csrMat = new CsrMatrix;
782: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
783: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
784: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
785: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
786: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
787: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
789: /* compute the transpose of the lower triangular factor, i.e. the CSC */
790: stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
791: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
792: loTriFactor->csrMat->values->data().get(),
793: loTriFactor->csrMat->row_offsets->data().get(),
794: loTriFactor->csrMat->column_indices->data().get(),
795: loTriFactorT->csrMat->values->data().get(),
796: loTriFactorT->csrMat->column_indices->data().get(),
797: loTriFactorT->csrMat->row_offsets->data().get(),
798: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
800: /* perform the solve analysis on the transposed matrix */
801: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
802: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
803: loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
804: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
805: loTriFactorT->solveInfo);CHKERRCUSP(stat);
807: /* assign the pointer. Is this really necessary? */
808: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
810: /*********************************************/
811: /* Now the Transpose of the Upper Tri Factor */
812: /*********************************************/
814: /* allocate space for the transpose of the upper triangular factor */
815: upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
817: /* set the matrix descriptors of the upper triangular factor */
818: matrixType = cusparseGetMatType(upTriFactor->descr);
819: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
820: fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
821: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
822: diagType = cusparseGetMatDiagType(upTriFactor->descr);
824: /* Create the matrix description */
825: stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
826: stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
827: stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
828: stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
829: stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);
831: /* Create the solve analysis information */
832: stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);
834: /* set the operation */
835: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
837: /* allocate GPU space for the CSC of the upper triangular factor*/
838: upTriFactorT->csrMat = new CsrMatrix;
839: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
840: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
841: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
842: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
843: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
844: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
846: /* compute the transpose of the upper triangular factor, i.e. the CSC */
847: stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
848: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
849: upTriFactor->csrMat->values->data().get(),
850: upTriFactor->csrMat->row_offsets->data().get(),
851: upTriFactor->csrMat->column_indices->data().get(),
852: upTriFactorT->csrMat->values->data().get(),
853: upTriFactorT->csrMat->column_indices->data().get(),
854: upTriFactorT->csrMat->row_offsets->data().get(),
855: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
857: /* perform the solve analysis on the transposed matrix */
858: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
859: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
860: upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
861: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
862: upTriFactorT->solveInfo);CHKERRCUSP(stat);
864: /* assign the pointer. Is this really necessary? */
865: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
866: return(0);
867: }
871: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
872: {
873: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
874: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
875: Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
876: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
877: cusparseStatus_t stat;
878: cusparseIndexBase_t indexBase;
879: cudaError_t err;
883: /* allocate space for the triangular factor information */
884: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
885: stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
886: indexBase = cusparseGetMatIndexBase(matstruct->descr);
887: stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
888: stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
890: /* set alpha and beta */
891: err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
892: err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
893: err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
894: err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
895: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
897: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
898: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
899: CsrMatrix *matrixT= new CsrMatrix;
900: matrixT->num_rows = A->rmap->n;
901: matrixT->num_cols = A->cmap->n;
902: matrixT->num_entries = a->nz;
903: matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
904: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
905: matrixT->values = new THRUSTARRAY(a->nz);
907: /* compute the transpose of the upper triangular factor, i.e. the CSC */
908: indexBase = cusparseGetMatIndexBase(matstruct->descr);
909: stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
910: matrix->num_cols, matrix->num_entries,
911: matrix->values->data().get(),
912: matrix->row_offsets->data().get(),
913: matrix->column_indices->data().get(),
914: matrixT->values->data().get(),
915: matrixT->column_indices->data().get(),
916: matrixT->row_offsets->data().get(),
917: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
919: /* assign the pointer */
920: matstructT->mat = matrixT;
922: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
923: #if CUDA_VERSION>=5000
924: /* First convert HYB to CSR */
925: CsrMatrix *temp= new CsrMatrix;
926: temp->num_rows = A->rmap->n;
927: temp->num_cols = A->cmap->n;
928: temp->num_entries = a->nz;
929: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
930: temp->column_indices = new THRUSTINTARRAY32(a->nz);
931: temp->values = new THRUSTARRAY(a->nz);
934: stat = cusparse_hyb2csr(cusparsestruct->handle,
935: matstruct->descr, (cusparseHybMat_t)matstruct->mat,
936: temp->values->data().get(),
937: temp->row_offsets->data().get(),
938: temp->column_indices->data().get());CHKERRCUSP(stat);
940: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
941: CsrMatrix *tempT= new CsrMatrix;
942: tempT->num_rows = A->rmap->n;
943: tempT->num_cols = A->cmap->n;
944: tempT->num_entries = a->nz;
945: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
946: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
947: tempT->values = new THRUSTARRAY(a->nz);
949: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
950: temp->num_cols, temp->num_entries,
951: temp->values->data().get(),
952: temp->row_offsets->data().get(),
953: temp->column_indices->data().get(),
954: tempT->values->data().get(),
955: tempT->column_indices->data().get(),
956: tempT->row_offsets->data().get(),
957: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
959: /* Last, convert CSC to HYB */
960: cusparseHybMat_t hybMat;
961: stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
962: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
963: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
964: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
965: matstructT->descr, tempT->values->data().get(),
966: tempT->row_offsets->data().get(),
967: tempT->column_indices->data().get(),
968: hybMat, 0, partition);CHKERRCUSP(stat);
970: /* assign the pointer */
971: matstructT->mat = hybMat;
973: /* delete temporaries */
974: if (tempT) {
975: if (tempT->values) delete (THRUSTARRAY*) tempT->values;
976: if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
977: if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
978: delete (CsrMatrix*) tempT;
979: }
980: if (temp) {
981: if (temp->values) delete (THRUSTARRAY*) temp->values;
982: if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
983: if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
984: delete (CsrMatrix*) temp;
985: }
986: #else
987: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
988: #endif
989: }
990: /* assign the compressed row indices */
991: matstructT->cprowIndices = new THRUSTINTARRAY;
993: /* assign the pointer */
994: ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
995: return(0);
996: }
1000: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1001: {
1002: CUSPARRAY *xGPU, *bGPU;
1003: cusparseStatus_t stat;
1004: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1005: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1006: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1007: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1008: PetscErrorCode ierr;
1011: /* Analyze the matrix and create the transpose ... on the fly */
1012: if (!loTriFactorT && !upTriFactorT) {
1013: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1014: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1015: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1016: }
1018: /* Get the GPU pointers */
1019: VecCUSPGetArrayWrite(xx,&xGPU);
1020: VecCUSPGetArrayRead(bb,&bGPU);
1022: /* First, reorder with the row permutation */
1023: thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
1024: thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
1025: xGPU->begin());
1027: /* First, solve U */
1028: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1029: upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1030: upTriFactorT->csrMat->values->data().get(),
1031: upTriFactorT->csrMat->row_offsets->data().get(),
1032: upTriFactorT->csrMat->column_indices->data().get(),
1033: upTriFactorT->solveInfo,
1034: xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1036: /* Then, solve L */
1037: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1038: loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1039: loTriFactorT->csrMat->values->data().get(),
1040: loTriFactorT->csrMat->row_offsets->data().get(),
1041: loTriFactorT->csrMat->column_indices->data().get(),
1042: loTriFactorT->solveInfo,
1043: tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
1045: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1046: thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1047: thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
1048: tempGPU->begin());
1050: /* Copy the temporary to the full solution. */
1051: thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());
1053: /* restore */
1054: VecCUSPRestoreArrayRead(bb,&bGPU);
1055: VecCUSPRestoreArrayWrite(xx,&xGPU);
1056: WaitForGPU();CHKERRCUSP(ierr);
1058: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1059: return(0);
1060: }
1064: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1065: {
1066: CUSPARRAY *xGPU,*bGPU;
1067: cusparseStatus_t stat;
1068: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1069: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1070: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1071: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1072: PetscErrorCode ierr;
1075: /* Analyze the matrix and create the transpose ... on the fly */
1076: if (!loTriFactorT && !upTriFactorT) {
1077: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1078: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1079: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1080: }
1082: /* Get the GPU pointers */
1083: VecCUSPGetArrayWrite(xx,&xGPU);
1084: VecCUSPGetArrayRead(bb,&bGPU);
1086: /* First, solve U */
1087: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1088: upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1089: upTriFactorT->csrMat->values->data().get(),
1090: upTriFactorT->csrMat->row_offsets->data().get(),
1091: upTriFactorT->csrMat->column_indices->data().get(),
1092: upTriFactorT->solveInfo,
1093: bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1095: /* Then, solve L */
1096: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1097: loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1098: loTriFactorT->csrMat->values->data().get(),
1099: loTriFactorT->csrMat->row_offsets->data().get(),
1100: loTriFactorT->csrMat->column_indices->data().get(),
1101: loTriFactorT->solveInfo,
1102: tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
1104: /* restore */
1105: VecCUSPRestoreArrayRead(bb,&bGPU);
1106: VecCUSPRestoreArrayWrite(xx,&xGPU);
1107: WaitForGPU();CHKERRCUSP(ierr);
1108: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1109: return(0);
1110: }
1114: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1115: {
1116: CUSPARRAY *xGPU,*bGPU;
1117: cusparseStatus_t stat;
1118: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1119: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1120: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1121: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1122: PetscErrorCode ierr;
1123: VecType t;
1124: PetscBool flg;
1127: VecGetType(bb,&t);
1128: PetscStrcmp(t,VECSEQCUSP,&flg);
1129: if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUSP);
1130: VecGetType(xx,&t);
1131: PetscStrcmp(t,VECSEQCUSP,&flg);
1132: if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUSP);
1134: /* Get the GPU pointers */
1135: VecCUSPGetArrayWrite(xx,&xGPU);
1136: VecCUSPGetArrayRead(bb,&bGPU);
1138: /* First, reorder with the row permutation */
1139: thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
1140: thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
1141: xGPU->begin());
1143: /* Next, solve L */
1144: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1145: loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1146: loTriFactor->csrMat->values->data().get(),
1147: loTriFactor->csrMat->row_offsets->data().get(),
1148: loTriFactor->csrMat->column_indices->data().get(),
1149: loTriFactor->solveInfo,
1150: xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1152: /* Then, solve U */
1153: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1154: upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1155: upTriFactor->csrMat->values->data().get(),
1156: upTriFactor->csrMat->row_offsets->data().get(),
1157: upTriFactor->csrMat->column_indices->data().get(),
1158: upTriFactor->solveInfo,
1159: tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
1161: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1162: thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1163: thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
1164: tempGPU->begin());
1166: /* Copy the temporary to the full solution. */
1167: thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());
1169: VecCUSPRestoreArrayRead(bb,&bGPU);
1170: VecCUSPRestoreArrayWrite(xx,&xGPU);
1171: WaitForGPU();CHKERRCUSP(ierr);
1172: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1173: return(0);
1174: }
1178: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1179: {
1180: CUSPARRAY *xGPU,*bGPU;
1181: cusparseStatus_t stat;
1182: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1183: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1184: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1185: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1186: PetscErrorCode ierr;
1189: /* Get the GPU pointers */
1190: VecCUSPGetArrayWrite(xx,&xGPU);
1191: VecCUSPGetArrayRead(bb,&bGPU);
1193: /* First, solve L */
1194: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1195: loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1196: loTriFactor->csrMat->values->data().get(),
1197: loTriFactor->csrMat->row_offsets->data().get(),
1198: loTriFactor->csrMat->column_indices->data().get(),
1199: loTriFactor->solveInfo,
1200: bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1202: /* Next, solve U */
1203: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1204: upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1205: upTriFactor->csrMat->values->data().get(),
1206: upTriFactor->csrMat->row_offsets->data().get(),
1207: upTriFactor->csrMat->column_indices->data().get(),
1208: upTriFactor->solveInfo,
1209: tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
1211: VecCUSPRestoreArrayRead(bb,&bGPU);
1212: VecCUSPRestoreArrayWrite(xx,&xGPU);
1213: WaitForGPU();CHKERRCUSP(ierr);
1214: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1215: return(0);
1216: }
1220: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1221: {
1223: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1224: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1225: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1226: PetscInt m = A->rmap->n,*ii,*ridx;
1227: PetscErrorCode ierr;
1228: cusparseStatus_t stat;
1229: cudaError_t err;
1232: if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
1233: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1234: Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1235: try {
1236: cusparsestruct->nonzerorow=0;
1237: for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1239: if (a->compressedrow.use) {
1240: m = a->compressedrow.nrows;
1241: ii = a->compressedrow.i;
1242: ridx = a->compressedrow.rindex;
1243: } else {
1244: /* Forcing compressed row on the GPU */
1245: int k=0;
1246: PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1247: PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1248: ii[0]=0;
1249: for (int j = 0; j<m; j++) {
1250: if ((a->i[j+1]-a->i[j])>0) {
1251: ii[k] = a->i[j];
1252: ridx[k]= j;
1253: k++;
1254: }
1255: }
1256: ii[cusparsestruct->nonzerorow] = a->nz;
1257: m = cusparsestruct->nonzerorow;
1258: }
1260: /* allocate space for the triangular factor information */
1261: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1262: stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
1263: stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
1264: stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
1266: err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
1267: err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1268: err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
1269: err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1270: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
1272: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1273: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1274: /* set the matrix */
1275: CsrMatrix *matrix= new CsrMatrix;
1276: matrix->num_rows = m;
1277: matrix->num_cols = A->cmap->n;
1278: matrix->num_entries = a->nz;
1279: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1280: matrix->row_offsets->assign(ii, ii + m+1);
1282: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1283: matrix->column_indices->assign(a->j, a->j+a->nz);
1285: matrix->values = new THRUSTARRAY(a->nz);
1286: matrix->values->assign(a->a, a->a+a->nz);
1288: /* assign the pointer */
1289: matstruct->mat = matrix;
1291: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1292: #if CUDA_VERSION>=4020
1293: CsrMatrix *matrix= new CsrMatrix;
1294: matrix->num_rows = m;
1295: matrix->num_cols = A->cmap->n;
1296: matrix->num_entries = a->nz;
1297: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1298: matrix->row_offsets->assign(ii, ii + m+1);
1300: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1301: matrix->column_indices->assign(a->j, a->j+a->nz);
1303: matrix->values = new THRUSTARRAY(a->nz);
1304: matrix->values->assign(a->a, a->a+a->nz);
1306: cusparseHybMat_t hybMat;
1307: stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
1308: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1309: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1310: stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1311: matstruct->descr, matrix->values->data().get(),
1312: matrix->row_offsets->data().get(),
1313: matrix->column_indices->data().get(),
1314: hybMat, 0, partition);CHKERRCUSP(stat);
1315: /* assign the pointer */
1316: matstruct->mat = hybMat;
1318: if (matrix) {
1319: if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1320: if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1321: if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1322: delete (CsrMatrix*)matrix;
1323: }
1324: #endif
1325: }
1327: /* assign the compressed row indices */
1328: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1329: matstruct->cprowIndices->assign(ridx,ridx+m);
1331: /* assign the pointer */
1332: cusparsestruct->mat = matstruct;
1334: if (!a->compressedrow.use) {
1335: PetscFree(ii);
1336: PetscFree(ridx);
1337: }
1338: cusparsestruct->workVector = new THRUSTARRAY;
1339: cusparsestruct->workVector->resize(m);
1340: } catch(char *ex) {
1341: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1342: }
1343: WaitForGPU();CHKERRCUSP(ierr);
1345: A->valid_GPU_matrix = PETSC_CUSP_BOTH;
1347: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1348: }
1349: return(0);
1350: }
1354: static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
1355: {
1357: PetscInt rbs,cbs;
1360: MatGetBlockSizes(mat,&rbs,&cbs);
1361: if (right) {
1362: VecCreate(PetscObjectComm((PetscObject)mat),right);
1363: VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
1364: VecSetBlockSize(*right,cbs);
1365: VecSetType(*right,VECSEQCUSP);
1366: PetscLayoutReference(mat->cmap,&(*right)->map);
1367: }
1368: if (left) {
1369: VecCreate(PetscObjectComm((PetscObject)mat),left);
1370: VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
1371: VecSetBlockSize(*left,rbs);
1372: VecSetType(*left,VECSEQCUSP);
1373: PetscLayoutReference(mat->rmap,&(*left)->map);
1374: }
1375: return(0);
1376: }
1378: struct VecCUSPPlusEquals
1379: {
1380: template <typename Tuple>
1381: __host__ __device__
1382: void operator()(Tuple t)
1383: {
1384: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1385: }
1386: };
1390: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1391: {
1392: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1393: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1394: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1395: CUSPARRAY *xarray,*yarray;
1396: PetscErrorCode ierr;
1397: cusparseStatus_t stat;
1400: /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1401: MatSeqAIJCUSPARSECopyToGPU(A); */
1402: VecCUSPGetArrayRead(xx,&xarray);
1403: VecCUSPGetArrayWrite(yy,&yarray);
1404: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1405: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1406: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1407: mat->num_rows, mat->num_cols, mat->num_entries,
1408: matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
1409: mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
1410: yarray->data().get());CHKERRCUSP(stat);
1411: } else {
1412: #if CUDA_VERSION>=4020
1413: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1414: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1415: matstruct->alpha, matstruct->descr, hybMat,
1416: xarray->data().get(), matstruct->beta,
1417: yarray->data().get());CHKERRCUSP(stat);
1418: #endif
1419: }
1420: VecCUSPRestoreArrayRead(xx,&xarray);
1421: VecCUSPRestoreArrayWrite(yy,&yarray);
1422: if (!cusparsestruct->stream) {
1423: WaitForGPU();CHKERRCUSP(ierr);
1424: }
1425: PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1426: return(0);
1427: }
1431: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1432: {
1433: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1434: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1435: Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1436: CUSPARRAY *xarray,*yarray;
1437: PetscErrorCode ierr;
1438: cusparseStatus_t stat;
1441: /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1442: MatSeqAIJCUSPARSECopyToGPU(A); */
1443: if (!matstructT) {
1444: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1445: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1446: }
1447: VecCUSPGetArrayRead(xx,&xarray);
1448: VecCUSPGetArrayWrite(yy,&yarray);
1450: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1451: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1452: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1453: mat->num_rows, mat->num_cols,
1454: mat->num_entries, matstructT->alpha, matstructT->descr,
1455: mat->values->data().get(), mat->row_offsets->data().get(),
1456: mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
1457: yarray->data().get());CHKERRCUSP(stat);
1458: } else {
1459: #if CUDA_VERSION>=4020
1460: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1461: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1462: matstructT->alpha, matstructT->descr, hybMat,
1463: xarray->data().get(), matstructT->beta,
1464: yarray->data().get());CHKERRCUSP(stat);
1465: #endif
1466: }
1467: VecCUSPRestoreArrayRead(xx,&xarray);
1468: VecCUSPRestoreArrayWrite(yy,&yarray);
1469: if (!cusparsestruct->stream) {
1470: WaitForGPU();CHKERRCUSP(ierr);
1471: }
1472: PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1473: return(0);
1474: }
1479: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1480: {
1481: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1482: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1483: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1484: CUSPARRAY *xarray,*yarray,*zarray;
1485: PetscErrorCode ierr;
1486: cusparseStatus_t stat;
1489: /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1490: MatSeqAIJCUSPARSECopyToGPU(A); */
1491: try {
1492: VecCopy_SeqCUSP(yy,zz);
1493: VecCUSPGetArrayRead(xx,&xarray);
1494: VecCUSPGetArrayRead(yy,&yarray);
1495: VecCUSPGetArrayWrite(zz,&zarray);
1497: /* multiply add */
1498: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1499: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1500: /* here we need to be careful to set the number of rows in the multiply to the
1501: number of compressed rows in the matrix ... which is equivalent to the
1502: size of the workVector */
1503: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1504: mat->num_rows, mat->num_cols,
1505: mat->num_entries, matstruct->alpha, matstruct->descr,
1506: mat->values->data().get(), mat->row_offsets->data().get(),
1507: mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
1508: cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1509: } else {
1510: #if CUDA_VERSION>=4020
1511: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1512: if (cusparsestruct->workVector->size()) {
1513: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1514: matstruct->alpha, matstruct->descr, hybMat,
1515: xarray->data().get(), matstruct->beta,
1516: cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1517: }
1518: #endif
1519: }
1521: /* scatter the data from the temporary into the full vector with a += operation */
1522: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
1523: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1524: VecCUSPPlusEquals());
1525: VecCUSPRestoreArrayRead(xx,&xarray);
1526: VecCUSPRestoreArrayRead(yy,&yarray);
1527: VecCUSPRestoreArrayWrite(zz,&zarray);
1529: } catch(char *ex) {
1530: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1531: }
1532: WaitForGPU();CHKERRCUSP(ierr);
1533: PetscLogFlops(2.0*a->nz);
1534: return(0);
1535: }
1539: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1540: {
1541: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1542: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1543: Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1544: CUSPARRAY *xarray,*yarray,*zarray;
1545: PetscErrorCode ierr;
1546: cusparseStatus_t stat;
1549: /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1550: MatSeqAIJCUSPARSECopyToGPU(A); */
1551: if (!matstructT) {
1552: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1553: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1554: }
1556: try {
1557: VecCopy_SeqCUSP(yy,zz);
1558: VecCUSPGetArrayRead(xx,&xarray);
1559: VecCUSPGetArrayRead(yy,&yarray);
1560: VecCUSPGetArrayWrite(zz,&zarray);
1562: /* multiply add with matrix transpose */
1563: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1564: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1565: /* here we need to be careful to set the number of rows in the multiply to the
1566: number of compressed rows in the matrix ... which is equivalent to the
1567: size of the workVector */
1568: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1569: mat->num_rows, mat->num_cols,
1570: mat->num_entries, matstructT->alpha, matstructT->descr,
1571: mat->values->data().get(), mat->row_offsets->data().get(),
1572: mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
1573: cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1574: } else {
1575: #if CUDA_VERSION>=4020
1576: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1577: if (cusparsestruct->workVector->size()) {
1578: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1579: matstructT->alpha, matstructT->descr, hybMat,
1580: xarray->data().get(), matstructT->beta,
1581: cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1582: }
1583: #endif
1584: }
1586: /* scatter the data from the temporary into the full vector with a += operation */
1587: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
1588: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1589: VecCUSPPlusEquals());
1591: VecCUSPRestoreArrayRead(xx,&xarray);
1592: VecCUSPRestoreArrayRead(yy,&yarray);
1593: VecCUSPRestoreArrayWrite(zz,&zarray);
1595: } catch(char *ex) {
1596: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1597: }
1598: WaitForGPU();CHKERRCUSP(ierr);
1599: PetscLogFlops(2.0*a->nz);
1600: return(0);
1601: }
1605: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1606: {
1610: MatAssemblyEnd_SeqAIJ(A,mode);
1611: if (A->factortype==MAT_FACTOR_NONE) {
1612: MatSeqAIJCUSPARSECopyToGPU(A);
1613: }
1614: if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1615: A->ops->mult = MatMult_SeqAIJCUSPARSE;
1616: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1617: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1618: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1619: return(0);
1620: }
1622: /* --------------------------------------------------------------------------------*/
1625: /*@
1626: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1627: (the default parallel PETSc format). This matrix will ultimately pushed down
1628: to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1629: assembly performance the user should preallocate the matrix storage by setting
1630: the parameter nz (or the array nnz). By setting these parameters accurately,
1631: performance during matrix assembly can be increased by more than a factor of 50.
1633: Collective on MPI_Comm
1635: Input Parameters:
1636: + comm - MPI communicator, set to PETSC_COMM_SELF
1637: . m - number of rows
1638: . n - number of columns
1639: . nz - number of nonzeros per row (same for all rows)
1640: - nnz - array containing the number of nonzeros in the various rows
1641: (possibly different for each row) or NULL
1643: Output Parameter:
1644: . A - the matrix
1646: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1647: MatXXXXSetPreallocation() paradgm instead of this routine directly.
1648: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1650: Notes:
1651: If nnz is given then nz is ignored
1653: The AIJ format (also called the Yale sparse matrix format or
1654: compressed row storage), is fully compatible with standard Fortran 77
1655: storage. That is, the stored row and column indices can begin at
1656: either one (as in Fortran) or zero. See the users' manual for details.
1658: Specify the preallocated storage with either nz or nnz (not both).
1659: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1660: allocation. For large problems you MUST preallocate memory or you
1661: will get TERRIBLE performance, see the users' manual chapter on matrices.
1663: By default, this format uses inodes (identical nodes) when possible, to
1664: improve numerical efficiency of matrix-vector products and solves. We
1665: search for consecutive rows with the same nonzero structure, thereby
1666: reusing matrix information to achieve increased efficiency.
1668: Level: intermediate
1670: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1671: @*/
1672: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1673: {
1677: MatCreate(comm,A);
1678: MatSetSizes(*A,m,n,m,n);
1679: MatSetType(*A,MATSEQAIJCUSPARSE);
1680: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1681: return(0);
1682: }
1686: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1687: {
1688: PetscErrorCode ierr;
1691: if (A->factortype==MAT_FACTOR_NONE) {
1692: if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
1693: Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1694: }
1695: } else {
1696: Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1697: }
1698: MatDestroy_SeqAIJ(A);
1699: return(0);
1700: }
1704: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1705: {
1707: cusparseStatus_t stat;
1708: cusparseHandle_t handle=0;
1711: MatCreate_SeqAIJ(B);
1712: if (B->factortype==MAT_FACTOR_NONE) {
1713: /* you cannot check the inode.use flag here since the matrix was just created.
1714: now build a GPU matrix data structure */
1715: B->spptr = new Mat_SeqAIJCUSPARSE;
1716: ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0;
1717: ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1718: ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0;
1719: ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR;
1720: ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0;
1721: ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0;
1722: stat = cusparseCreate(&handle);CHKERRCUSP(stat);
1723: ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle;
1724: ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0;
1725: } else {
1726: /* NEXT, set the pointers to the triangular factors */
1727: B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1728: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
1729: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
1730: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1731: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1732: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0;
1733: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0;
1734: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0;
1735: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0;
1736: stat = cusparseCreate(&handle);CHKERRCUSP(stat);
1737: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle;
1738: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0;
1739: }
1741: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
1742: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
1743: B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE;
1744: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
1745: B->ops->mult = MatMult_SeqAIJCUSPARSE;
1746: B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1747: B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1748: B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1750: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
1752: B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
1754: PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1755: return(0);
1756: }
1758: /*M
1759: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1761: A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1762: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1763: All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1765: Options Database Keys:
1766: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1767: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1768: . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1770: Level: beginner
1772: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1773: M*/
1775: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1780: PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void)
1781: {
1785: MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
1786: MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
1787: MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
1788: MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
1789: return(0);
1790: }
1795: static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1796: {
1797: cusparseStatus_t stat;
1798: cusparseHandle_t handle;
1801: if (*cusparsestruct) {
1802: Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1803: Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1804: delete (*cusparsestruct)->workVector;
1805: if (handle = (*cusparsestruct)->handle) {
1806: stat = cusparseDestroy(handle);CHKERRCUSP(stat);
1807: }
1808: delete *cusparsestruct;
1809: *cusparsestruct = 0;
1810: }
1811: return(0);
1812: }
1816: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1817: {
1819: if (*mat) {
1820: delete (*mat)->values;
1821: delete (*mat)->column_indices;
1822: delete (*mat)->row_offsets;
1823: delete *mat;
1824: *mat = 0;
1825: }
1826: return(0);
1827: }
1831: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1832: {
1833: cusparseStatus_t stat;
1834: PetscErrorCode ierr;
1837: if (*trifactor) {
1838: if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSP(stat); }
1839: if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSP(stat); }
1840: CsrMatrix_Destroy(&(*trifactor)->csrMat);
1841: delete *trifactor;
1842: *trifactor = 0;
1843: }
1844: return(0);
1845: }
1849: static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1850: {
1851: CsrMatrix *mat;
1852: cusparseStatus_t stat;
1853: cudaError_t err;
1856: if (*matstruct) {
1857: if ((*matstruct)->mat) {
1858: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1859: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1860: stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
1861: } else {
1862: mat = (CsrMatrix*)(*matstruct)->mat;
1863: CsrMatrix_Destroy(&mat);
1864: }
1865: }
1866: if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSP(stat); }
1867: delete (*matstruct)->cprowIndices;
1868: if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUSP(err); }
1869: if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUSP(err); }
1870: delete *matstruct;
1871: *matstruct = 0;
1872: }
1873: return(0);
1874: }
1878: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1879: {
1880: cusparseHandle_t handle;
1881: cusparseStatus_t stat;
1884: if (*trifactors) {
1885: Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1886: Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1887: Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1888: Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1889: delete (*trifactors)->rpermIndices;
1890: delete (*trifactors)->cpermIndices;
1891: delete (*trifactors)->workVector;
1892: if (handle = (*trifactors)->handle) {
1893: stat = cusparseDestroy(handle);CHKERRCUSP(stat);
1894: }
1895: delete *trifactors;
1896: *trifactors = 0;
1897: }
1898: return(0);
1899: }