Actual source code: aijcusparse.cu
petsc-3.8.4 2018-03-24
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library,
4: */
5: #define PETSC_SKIP_SPINLOCK
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/vecimpl.h>
12: #undef VecType
13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
17: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
21: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
25: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
35: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37: static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39: static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
41: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
42: {
43: cusparseStatus_t stat;
44: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
47: cusparsestruct->stream = stream;
48: stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
49: return(0);
50: }
52: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
53: {
54: cusparseStatus_t stat;
55: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
58: if (cusparsestruct->handle != handle) {
59: if (cusparsestruct->handle) {
60: stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
61: }
62: cusparsestruct->handle = handle;
63: }
64: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
65: return(0);
66: }
68: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
69: {
70: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
72: if (cusparsestruct->handle)
73: cusparsestruct->handle = 0;
74: return(0);
75: }
77: PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
78: {
80: *type = MATSOLVERCUSPARSE;
81: return(0);
82: }
84: /*MC
85: MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86: on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
87: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
88: performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
89: CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90: algorithms are not recommended. This class does NOT support direct solver operations.
92: Level: beginner
94: .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95: M*/
97: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
98: {
100: PetscInt n = A->rmap->n;
103: MatCreate(PetscObjectComm((PetscObject)A),B);
104: (*B)->factortype = ftype;
105: MatSetSizes(*B,n,n,n,n);
106: MatSetType(*B,MATSEQAIJCUSPARSE);
108: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
109: MatSetBlockSizesFromMats(*B,A,A);
110: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
111: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
115: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
117: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
118: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);
119: return(0);
120: }
122: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
123: {
124: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
127: #if CUDA_VERSION>=4020
128: switch (op) {
129: case MAT_CUSPARSE_MULT:
130: cusparsestruct->format = format;
131: break;
132: case MAT_CUSPARSE_ALL:
133: cusparsestruct->format = format;
134: break;
135: default:
136: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
137: }
138: #else
139: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
140: #endif
141: return(0);
142: }
144: /*@
145: MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146: operation. Only the MatMult operation can use different GPU storage formats
147: for MPIAIJCUSPARSE matrices.
148: Not Collective
150: Input Parameters:
151: + A - Matrix of type SEQAIJCUSPARSE
152: . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153: - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
155: Output Parameter:
157: Level: intermediate
159: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160: @*/
161: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
162: {
167: PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
168: return(0);
169: }
171: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
172: {
173: PetscErrorCode ierr;
174: MatCUSPARSEStorageFormat format;
175: PetscBool flg;
176: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
179: PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
180: PetscObjectOptionsBegin((PetscObject)A);
181: if (A->factortype==MAT_FACTOR_NONE) {
182: PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
183: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
184: if (flg) {
185: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
186: }
187: }
188: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
189: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
190: if (flg) {
191: MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
192: }
193: PetscOptionsEnd();
194: return(0);
196: }
198: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
199: {
203: MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
204: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
205: return(0);
206: }
208: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
209: {
213: MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
214: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
215: return(0);
216: }
218: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
219: {
223: MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
224: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
225: return(0);
226: }
228: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
229: {
233: MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
234: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
235: return(0);
236: }
238: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
239: {
240: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
241: PetscInt n = A->rmap->n;
242: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
243: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
244: cusparseStatus_t stat;
245: const PetscInt *ai = a->i,*aj = a->j,*vi;
246: const MatScalar *aa = a->a,*v;
247: PetscInt *AiLo, *AjLo;
248: PetscScalar *AALo;
249: PetscInt i,nz, nzLower, offset, rowOffset;
250: PetscErrorCode ierr;
253: if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
254: try {
255: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
256: nzLower=n+ai[n]-ai[1];
258: /* Allocate Space for the lower triangular matrix */
259: cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
260: cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
261: cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);
263: /* Fill the lower triangular matrix */
264: AiLo[0] = (PetscInt) 0;
265: AiLo[n] = nzLower;
266: AjLo[0] = (PetscInt) 0;
267: AALo[0] = (MatScalar) 1.0;
268: v = aa;
269: vi = aj;
270: offset = 1;
271: rowOffset= 1;
272: for (i=1; i<n; i++) {
273: nz = ai[i+1] - ai[i];
274: /* additional 1 for the term on the diagonal */
275: AiLo[i] = rowOffset;
276: rowOffset += nz+1;
278: PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));
279: PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));
281: offset += nz;
282: AjLo[offset] = (PetscInt) i;
283: AALo[offset] = (MatScalar) 1.0;
284: offset += 1;
286: v += nz;
287: vi += nz;
288: }
290: /* allocate space for the triangular factor information */
291: loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
293: /* Create the matrix description */
294: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
295: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
296: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
297: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
298: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
300: /* Create the solve analysis information */
301: stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
303: /* set the operation */
304: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
306: /* set the matrix */
307: loTriFactor->csrMat = new CsrMatrix;
308: loTriFactor->csrMat->num_rows = n;
309: loTriFactor->csrMat->num_cols = n;
310: loTriFactor->csrMat->num_entries = nzLower;
312: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
313: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
315: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
316: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
318: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
319: loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
321: /* perform the solve analysis */
322: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
323: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
324: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
325: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
327: /* assign the pointer. Is this really necessary? */
328: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
330: cudaFreeHost(AiLo);CHKERRCUDA(ierr);
331: cudaFreeHost(AjLo);CHKERRCUDA(ierr);
332: cudaFreeHost(AALo);CHKERRCUDA(ierr);
333: } catch(char *ex) {
334: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
335: }
336: }
337: return(0);
338: }
340: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
341: {
342: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
343: PetscInt n = A->rmap->n;
344: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
345: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
346: cusparseStatus_t stat;
347: const PetscInt *aj = a->j,*adiag = a->diag,*vi;
348: const MatScalar *aa = a->a,*v;
349: PetscInt *AiUp, *AjUp;
350: PetscScalar *AAUp;
351: PetscInt i,nz, nzUpper, offset;
352: PetscErrorCode ierr;
355: if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
356: try {
357: /* next, figure out the number of nonzeros in the upper triangular matrix. */
358: nzUpper = adiag[0]-adiag[n];
360: /* Allocate Space for the upper triangular matrix */
361: cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
362: cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
363: cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
365: /* Fill the upper triangular matrix */
366: AiUp[0]=(PetscInt) 0;
367: AiUp[n]=nzUpper;
368: offset = nzUpper;
369: for (i=n-1; i>=0; i--) {
370: v = aa + adiag[i+1] + 1;
371: vi = aj + adiag[i+1] + 1;
373: /* number of elements NOT on the diagonal */
374: nz = adiag[i] - adiag[i+1]-1;
376: /* decrement the offset */
377: offset -= (nz+1);
379: /* first, set the diagonal elements */
380: AjUp[offset] = (PetscInt) i;
381: AAUp[offset] = (MatScalar)1./v[nz];
382: AiUp[i] = AiUp[i+1] - (nz+1);
384: PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));
385: PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));
386: }
388: /* allocate space for the triangular factor information */
389: upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
391: /* Create the matrix description */
392: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
393: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
394: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
395: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
396: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
398: /* Create the solve analysis information */
399: stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
401: /* set the operation */
402: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
404: /* set the matrix */
405: upTriFactor->csrMat = new CsrMatrix;
406: upTriFactor->csrMat->num_rows = n;
407: upTriFactor->csrMat->num_cols = n;
408: upTriFactor->csrMat->num_entries = nzUpper;
410: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
411: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
413: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
414: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
416: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
417: upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
419: /* perform the solve analysis */
420: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
421: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
422: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
423: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
425: /* assign the pointer. Is this really necessary? */
426: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
428: cudaFreeHost(AiUp);CHKERRCUDA(ierr);
429: cudaFreeHost(AjUp);CHKERRCUDA(ierr);
430: cudaFreeHost(AAUp);CHKERRCUDA(ierr);
431: } catch(char *ex) {
432: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
433: }
434: }
435: return(0);
436: }
438: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
439: {
440: PetscErrorCode ierr;
441: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
442: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
443: IS isrow = a->row,iscol = a->icol;
444: PetscBool row_identity,col_identity;
445: const PetscInt *r,*c;
446: PetscInt n = A->rmap->n;
449: MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
450: MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);
452: cusparseTriFactors->workVector = new THRUSTARRAY;
453: cusparseTriFactors->workVector->resize(n);
454: cusparseTriFactors->nnz=a->nz;
456: A->valid_GPU_matrix = PETSC_CUDA_BOTH;
457: /*lower triangular indices */
458: ISGetIndices(isrow,&r);
459: ISIdentity(isrow,&row_identity);
460: if (!row_identity) {
461: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
462: cusparseTriFactors->rpermIndices->assign(r, r+n);
463: }
464: ISRestoreIndices(isrow,&r);
466: /*upper triangular indices */
467: ISGetIndices(iscol,&c);
468: ISIdentity(iscol,&col_identity);
469: if (!col_identity) {
470: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
471: cusparseTriFactors->cpermIndices->assign(c, c+n);
472: }
473: ISRestoreIndices(iscol,&c);
474: return(0);
475: }
477: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
478: {
479: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
480: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
481: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
482: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
483: cusparseStatus_t stat;
484: PetscErrorCode ierr;
485: PetscInt *AiUp, *AjUp;
486: PetscScalar *AAUp;
487: PetscScalar *AALo;
488: PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
489: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
490: const PetscInt *ai = b->i,*aj = b->j,*vj;
491: const MatScalar *aa = b->a,*v;
494: if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
495: try {
496: /* Allocate Space for the upper triangular matrix */
497: cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
498: cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
499: cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
500: cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
502: /* Fill the upper triangular matrix */
503: AiUp[0]=(PetscInt) 0;
504: AiUp[n]=nzUpper;
505: offset = 0;
506: for (i=0; i<n; i++) {
507: /* set the pointers */
508: v = aa + ai[i];
509: vj = aj + ai[i];
510: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
512: /* first, set the diagonal elements */
513: AjUp[offset] = (PetscInt) i;
514: AAUp[offset] = (MatScalar)1.0/v[nz];
515: AiUp[i] = offset;
516: AALo[offset] = (MatScalar)1.0/v[nz];
518: offset+=1;
519: if (nz>0) {
520: PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));
521: PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));
522: for (j=offset; j<offset+nz; j++) {
523: AAUp[j] = -AAUp[j];
524: AALo[j] = AAUp[j]/v[nz];
525: }
526: offset+=nz;
527: }
528: }
530: /* allocate space for the triangular factor information */
531: upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
533: /* Create the matrix description */
534: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
535: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
536: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
537: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
538: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
540: /* Create the solve analysis information */
541: stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
543: /* set the operation */
544: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
546: /* set the matrix */
547: upTriFactor->csrMat = new CsrMatrix;
548: upTriFactor->csrMat->num_rows = A->rmap->n;
549: upTriFactor->csrMat->num_cols = A->cmap->n;
550: upTriFactor->csrMat->num_entries = a->nz;
552: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
553: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
555: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
556: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
558: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
559: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
561: /* perform the solve analysis */
562: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
563: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
564: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
565: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
567: /* assign the pointer. Is this really necessary? */
568: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
570: /* allocate space for the triangular factor information */
571: loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
573: /* Create the matrix description */
574: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
575: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
576: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
577: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
578: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
580: /* Create the solve analysis information */
581: stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
583: /* set the operation */
584: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
586: /* set the matrix */
587: loTriFactor->csrMat = new CsrMatrix;
588: loTriFactor->csrMat->num_rows = A->rmap->n;
589: loTriFactor->csrMat->num_cols = A->cmap->n;
590: loTriFactor->csrMat->num_entries = a->nz;
592: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
593: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
595: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
596: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
598: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
599: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
601: /* perform the solve analysis */
602: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
603: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
604: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
605: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
607: /* assign the pointer. Is this really necessary? */
608: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
610: A->valid_GPU_matrix = PETSC_CUDA_BOTH;
611: cudaFreeHost(AiUp);CHKERRCUDA(ierr);
612: cudaFreeHost(AjUp);CHKERRCUDA(ierr);
613: cudaFreeHost(AAUp);CHKERRCUDA(ierr);
614: cudaFreeHost(AALo);CHKERRCUDA(ierr);
615: } catch(char *ex) {
616: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
617: }
618: }
619: return(0);
620: }
622: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
623: {
624: PetscErrorCode ierr;
625: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
626: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
627: IS ip = a->row;
628: const PetscInt *rip;
629: PetscBool perm_identity;
630: PetscInt n = A->rmap->n;
633: MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
634: cusparseTriFactors->workVector = new THRUSTARRAY;
635: cusparseTriFactors->workVector->resize(n);
636: cusparseTriFactors->nnz=(a->nz-n)*2 + n;
638: /*lower triangular indices */
639: ISGetIndices(ip,&rip);
640: ISIdentity(ip,&perm_identity);
641: if (!perm_identity) {
642: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
643: cusparseTriFactors->rpermIndices->assign(rip, rip+n);
644: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
645: cusparseTriFactors->cpermIndices->assign(rip, rip+n);
646: }
647: ISRestoreIndices(ip,&rip);
648: return(0);
649: }
651: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
652: {
653: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
654: IS isrow = b->row,iscol = b->col;
655: PetscBool row_identity,col_identity;
659: MatLUFactorNumeric_SeqAIJ(B,A,info);
660: /* determine which version of MatSolve needs to be used. */
661: ISIdentity(isrow,&row_identity);
662: ISIdentity(iscol,&col_identity);
663: if (row_identity && col_identity) {
664: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
665: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
666: } else {
667: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
668: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
669: }
671: /* get the triangular factors */
672: MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
673: return(0);
674: }
676: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
677: {
678: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
679: IS ip = b->row;
680: PetscBool perm_identity;
684: MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
686: /* determine which version of MatSolve needs to be used. */
687: ISIdentity(ip,&perm_identity);
688: if (perm_identity) {
689: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
690: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
691: } else {
692: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
693: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
694: }
696: /* get the triangular factors */
697: MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
698: return(0);
699: }
701: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
702: {
703: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
704: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
705: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
706: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
707: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
708: cusparseStatus_t stat;
709: cusparseIndexBase_t indexBase;
710: cusparseMatrixType_t matrixType;
711: cusparseFillMode_t fillMode;
712: cusparseDiagType_t diagType;
716: /*********************************************/
717: /* Now the Transpose of the Lower Tri Factor */
718: /*********************************************/
720: /* allocate space for the transpose of the lower triangular factor */
721: loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
723: /* set the matrix descriptors of the lower triangular factor */
724: matrixType = cusparseGetMatType(loTriFactor->descr);
725: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
726: fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
727: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
728: diagType = cusparseGetMatDiagType(loTriFactor->descr);
730: /* Create the matrix description */
731: stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
732: stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
733: stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
734: stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
735: stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);
737: /* Create the solve analysis information */
738: stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);
740: /* set the operation */
741: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
743: /* allocate GPU space for the CSC of the lower triangular factor*/
744: loTriFactorT->csrMat = new CsrMatrix;
745: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
746: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
747: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
748: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
749: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
750: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
752: /* compute the transpose of the lower triangular factor, i.e. the CSC */
753: stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
754: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
755: loTriFactor->csrMat->values->data().get(),
756: loTriFactor->csrMat->row_offsets->data().get(),
757: loTriFactor->csrMat->column_indices->data().get(),
758: loTriFactorT->csrMat->values->data().get(),
759: loTriFactorT->csrMat->column_indices->data().get(),
760: loTriFactorT->csrMat->row_offsets->data().get(),
761: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
763: /* perform the solve analysis on the transposed matrix */
764: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
765: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
766: loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
767: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
768: loTriFactorT->solveInfo);CHKERRCUDA(stat);
770: /* assign the pointer. Is this really necessary? */
771: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
773: /*********************************************/
774: /* Now the Transpose of the Upper Tri Factor */
775: /*********************************************/
777: /* allocate space for the transpose of the upper triangular factor */
778: upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
780: /* set the matrix descriptors of the upper triangular factor */
781: matrixType = cusparseGetMatType(upTriFactor->descr);
782: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
783: fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
784: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
785: diagType = cusparseGetMatDiagType(upTriFactor->descr);
787: /* Create the matrix description */
788: stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
789: stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
790: stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
791: stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
792: stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);
794: /* Create the solve analysis information */
795: stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);
797: /* set the operation */
798: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
800: /* allocate GPU space for the CSC of the upper triangular factor*/
801: upTriFactorT->csrMat = new CsrMatrix;
802: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
803: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
804: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
805: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
806: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
807: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
809: /* compute the transpose of the upper triangular factor, i.e. the CSC */
810: stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
811: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
812: upTriFactor->csrMat->values->data().get(),
813: upTriFactor->csrMat->row_offsets->data().get(),
814: upTriFactor->csrMat->column_indices->data().get(),
815: upTriFactorT->csrMat->values->data().get(),
816: upTriFactorT->csrMat->column_indices->data().get(),
817: upTriFactorT->csrMat->row_offsets->data().get(),
818: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
820: /* perform the solve analysis on the transposed matrix */
821: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
822: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
823: upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
824: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
825: upTriFactorT->solveInfo);CHKERRCUDA(stat);
827: /* assign the pointer. Is this really necessary? */
828: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
829: return(0);
830: }
832: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
833: {
834: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
835: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
836: Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
837: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
838: cusparseStatus_t stat;
839: cusparseIndexBase_t indexBase;
840: cudaError_t err;
844: /* allocate space for the triangular factor information */
845: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
846: stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
847: indexBase = cusparseGetMatIndexBase(matstruct->descr);
848: stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
849: stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
851: /* set alpha and beta */
852: err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
853: err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
854: err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
855: err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
856: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
858: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
859: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
860: CsrMatrix *matrixT= new CsrMatrix;
861: matrixT->num_rows = A->rmap->n;
862: matrixT->num_cols = A->cmap->n;
863: matrixT->num_entries = a->nz;
864: matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
865: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
866: matrixT->values = new THRUSTARRAY(a->nz);
868: /* compute the transpose of the upper triangular factor, i.e. the CSC */
869: indexBase = cusparseGetMatIndexBase(matstruct->descr);
870: stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
871: matrix->num_cols, matrix->num_entries,
872: matrix->values->data().get(),
873: matrix->row_offsets->data().get(),
874: matrix->column_indices->data().get(),
875: matrixT->values->data().get(),
876: matrixT->column_indices->data().get(),
877: matrixT->row_offsets->data().get(),
878: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
880: /* assign the pointer */
881: matstructT->mat = matrixT;
883: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
884: #if CUDA_VERSION>=5000
885: /* First convert HYB to CSR */
886: CsrMatrix *temp= new CsrMatrix;
887: temp->num_rows = A->rmap->n;
888: temp->num_cols = A->cmap->n;
889: temp->num_entries = a->nz;
890: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
891: temp->column_indices = new THRUSTINTARRAY32(a->nz);
892: temp->values = new THRUSTARRAY(a->nz);
895: stat = cusparse_hyb2csr(cusparsestruct->handle,
896: matstruct->descr, (cusparseHybMat_t)matstruct->mat,
897: temp->values->data().get(),
898: temp->row_offsets->data().get(),
899: temp->column_indices->data().get());CHKERRCUDA(stat);
901: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
902: CsrMatrix *tempT= new CsrMatrix;
903: tempT->num_rows = A->rmap->n;
904: tempT->num_cols = A->cmap->n;
905: tempT->num_entries = a->nz;
906: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
907: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
908: tempT->values = new THRUSTARRAY(a->nz);
910: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
911: temp->num_cols, temp->num_entries,
912: temp->values->data().get(),
913: temp->row_offsets->data().get(),
914: temp->column_indices->data().get(),
915: tempT->values->data().get(),
916: tempT->column_indices->data().get(),
917: tempT->row_offsets->data().get(),
918: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
920: /* Last, convert CSC to HYB */
921: cusparseHybMat_t hybMat;
922: stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
923: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
924: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
925: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
926: matstructT->descr, tempT->values->data().get(),
927: tempT->row_offsets->data().get(),
928: tempT->column_indices->data().get(),
929: hybMat, 0, partition);CHKERRCUDA(stat);
931: /* assign the pointer */
932: matstructT->mat = hybMat;
934: /* delete temporaries */
935: if (tempT) {
936: if (tempT->values) delete (THRUSTARRAY*) tempT->values;
937: if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
938: if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
939: delete (CsrMatrix*) tempT;
940: }
941: if (temp) {
942: if (temp->values) delete (THRUSTARRAY*) temp->values;
943: if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
944: if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
945: delete (CsrMatrix*) temp;
946: }
947: #else
948: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
949: #endif
950: }
951: /* assign the compressed row indices */
952: matstructT->cprowIndices = new THRUSTINTARRAY;
954: /* assign the pointer */
955: ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
956: return(0);
957: }
959: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
960: {
961: PetscInt n = xx->map->n;
962: const PetscScalar *barray;
963: PetscScalar *xarray;
964: thrust::device_ptr<const PetscScalar> bGPU;
965: thrust::device_ptr<PetscScalar> xGPU;
966: cusparseStatus_t stat;
967: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
968: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
969: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
970: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
971: PetscErrorCode ierr;
974: /* Analyze the matrix and create the transpose ... on the fly */
975: if (!loTriFactorT && !upTriFactorT) {
976: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
977: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
978: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
979: }
981: /* Get the GPU pointers */
982: VecCUDAGetArrayWrite(xx,&xarray);
983: VecCUDAGetArrayRead(bb,&barray);
984: xGPU = thrust::device_pointer_cast(xarray);
985: bGPU = thrust::device_pointer_cast(barray);
987: /* First, reorder with the row permutation */
988: thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
989: thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
990: xGPU);
992: /* First, solve U */
993: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
994: upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
995: upTriFactorT->csrMat->values->data().get(),
996: upTriFactorT->csrMat->row_offsets->data().get(),
997: upTriFactorT->csrMat->column_indices->data().get(),
998: upTriFactorT->solveInfo,
999: xarray, tempGPU->data().get());CHKERRCUDA(stat);
1001: /* Then, solve L */
1002: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1003: loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1004: loTriFactorT->csrMat->values->data().get(),
1005: loTriFactorT->csrMat->row_offsets->data().get(),
1006: loTriFactorT->csrMat->column_indices->data().get(),
1007: loTriFactorT->solveInfo,
1008: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1010: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1011: thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1012: thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1013: tempGPU->begin());
1015: /* Copy the temporary to the full solution. */
1016: thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1018: /* restore */
1019: VecCUDARestoreArrayRead(bb,&barray);
1020: VecCUDARestoreArrayWrite(xx,&xarray);
1021: WaitForGPU();CHKERRCUDA(ierr);
1023: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1024: return(0);
1025: }
1027: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1028: {
1029: const PetscScalar *barray;
1030: PetscScalar *xarray;
1031: cusparseStatus_t stat;
1032: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1033: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1034: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1035: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1036: PetscErrorCode ierr;
1039: /* Analyze the matrix and create the transpose ... on the fly */
1040: if (!loTriFactorT && !upTriFactorT) {
1041: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1042: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1043: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1044: }
1046: /* Get the GPU pointers */
1047: VecCUDAGetArrayWrite(xx,&xarray);
1048: VecCUDAGetArrayRead(bb,&barray);
1050: /* First, solve U */
1051: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1052: upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1053: upTriFactorT->csrMat->values->data().get(),
1054: upTriFactorT->csrMat->row_offsets->data().get(),
1055: upTriFactorT->csrMat->column_indices->data().get(),
1056: upTriFactorT->solveInfo,
1057: barray, tempGPU->data().get());CHKERRCUDA(stat);
1059: /* Then, solve L */
1060: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1061: loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1062: loTriFactorT->csrMat->values->data().get(),
1063: loTriFactorT->csrMat->row_offsets->data().get(),
1064: loTriFactorT->csrMat->column_indices->data().get(),
1065: loTriFactorT->solveInfo,
1066: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1068: /* restore */
1069: VecCUDARestoreArrayRead(bb,&barray);
1070: VecCUDARestoreArrayWrite(xx,&xarray);
1071: WaitForGPU();CHKERRCUDA(ierr);
1072: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1073: return(0);
1074: }
1076: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1077: {
1078: const PetscScalar *barray;
1079: PetscScalar *xarray;
1080: thrust::device_ptr<const PetscScalar> bGPU;
1081: thrust::device_ptr<PetscScalar> xGPU;
1082: cusparseStatus_t stat;
1083: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1084: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1085: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1086: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1087: PetscErrorCode ierr;
1091: /* Get the GPU pointers */
1092: VecCUDAGetArrayWrite(xx,&xarray);
1093: VecCUDAGetArrayRead(bb,&barray);
1094: xGPU = thrust::device_pointer_cast(xarray);
1095: bGPU = thrust::device_pointer_cast(barray);
1097: /* First, reorder with the row permutation */
1098: thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1099: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1100: xGPU);
1102: /* Next, solve L */
1103: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1104: loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1105: loTriFactor->csrMat->values->data().get(),
1106: loTriFactor->csrMat->row_offsets->data().get(),
1107: loTriFactor->csrMat->column_indices->data().get(),
1108: loTriFactor->solveInfo,
1109: xarray, tempGPU->data().get());CHKERRCUDA(stat);
1111: /* Then, solve U */
1112: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1113: upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1114: upTriFactor->csrMat->values->data().get(),
1115: upTriFactor->csrMat->row_offsets->data().get(),
1116: upTriFactor->csrMat->column_indices->data().get(),
1117: upTriFactor->solveInfo,
1118: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1120: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1121: thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1122: thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
1123: tempGPU->begin());
1125: /* Copy the temporary to the full solution. */
1126: thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1128: VecCUDARestoreArrayRead(bb,&barray);
1129: VecCUDARestoreArrayWrite(xx,&xarray);
1130: WaitForGPU();CHKERRCUDA(ierr);
1131: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1132: return(0);
1133: }
1135: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1136: {
1137: const PetscScalar *barray;
1138: PetscScalar *xarray;
1139: cusparseStatus_t stat;
1140: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1141: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1142: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1143: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1144: PetscErrorCode ierr;
1147: /* Get the GPU pointers */
1148: VecCUDAGetArrayWrite(xx,&xarray);
1149: VecCUDAGetArrayRead(bb,&barray);
1151: /* First, solve L */
1152: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1153: loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1154: loTriFactor->csrMat->values->data().get(),
1155: loTriFactor->csrMat->row_offsets->data().get(),
1156: loTriFactor->csrMat->column_indices->data().get(),
1157: loTriFactor->solveInfo,
1158: barray, tempGPU->data().get());CHKERRCUDA(stat);
1160: /* Next, solve U */
1161: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1162: upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1163: upTriFactor->csrMat->values->data().get(),
1164: upTriFactor->csrMat->row_offsets->data().get(),
1165: upTriFactor->csrMat->column_indices->data().get(),
1166: upTriFactor->solveInfo,
1167: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1169: VecCUDARestoreArrayRead(bb,&barray);
1170: VecCUDARestoreArrayWrite(xx,&xarray);
1171: WaitForGPU();CHKERRCUDA(ierr);
1172: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1173: return(0);
1174: }
1176: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1177: {
1179: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1180: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1181: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1182: PetscInt m = A->rmap->n,*ii,*ridx;
1183: PetscErrorCode ierr;
1184: cusparseStatus_t stat;
1185: cudaError_t err;
1188: if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
1189: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1190: if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1191: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1192: /* copy values only */
1193: matrix->values->assign(a->a, a->a+a->nz);
1194: } else {
1195: Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1196: try {
1197: cusparsestruct->nonzerorow=0;
1198: for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1200: if (a->compressedrow.use) {
1201: m = a->compressedrow.nrows;
1202: ii = a->compressedrow.i;
1203: ridx = a->compressedrow.rindex;
1204: } else {
1205: /* Forcing compressed row on the GPU */
1206: int k=0;
1207: PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1208: PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1209: ii[0]=0;
1210: for (int j = 0; j<m; j++) {
1211: if ((a->i[j+1]-a->i[j])>0) {
1212: ii[k] = a->i[j];
1213: ridx[k]= j;
1214: k++;
1215: }
1216: }
1217: ii[cusparsestruct->nonzerorow] = a->nz;
1218: m = cusparsestruct->nonzerorow;
1219: }
1221: /* allocate space for the triangular factor information */
1222: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1223: stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1224: stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1225: stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1227: err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
1228: err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1229: err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
1230: err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1231: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1233: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1234: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1235: /* set the matrix */
1236: CsrMatrix *matrix= new CsrMatrix;
1237: matrix->num_rows = m;
1238: matrix->num_cols = A->cmap->n;
1239: matrix->num_entries = a->nz;
1240: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1241: matrix->row_offsets->assign(ii, ii + m+1);
1243: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1244: matrix->column_indices->assign(a->j, a->j+a->nz);
1246: matrix->values = new THRUSTARRAY(a->nz);
1247: matrix->values->assign(a->a, a->a+a->nz);
1249: /* assign the pointer */
1250: matstruct->mat = matrix;
1252: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1253: #if CUDA_VERSION>=4020
1254: CsrMatrix *matrix= new CsrMatrix;
1255: matrix->num_rows = m;
1256: matrix->num_cols = A->cmap->n;
1257: matrix->num_entries = a->nz;
1258: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1259: matrix->row_offsets->assign(ii, ii + m+1);
1261: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1262: matrix->column_indices->assign(a->j, a->j+a->nz);
1264: matrix->values = new THRUSTARRAY(a->nz);
1265: matrix->values->assign(a->a, a->a+a->nz);
1267: cusparseHybMat_t hybMat;
1268: stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1269: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1270: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1271: stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1272: matstruct->descr, matrix->values->data().get(),
1273: matrix->row_offsets->data().get(),
1274: matrix->column_indices->data().get(),
1275: hybMat, 0, partition);CHKERRCUDA(stat);
1276: /* assign the pointer */
1277: matstruct->mat = hybMat;
1279: if (matrix) {
1280: if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1281: if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1282: if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1283: delete (CsrMatrix*)matrix;
1284: }
1285: #endif
1286: }
1288: /* assign the compressed row indices */
1289: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1290: matstruct->cprowIndices->assign(ridx,ridx+m);
1292: /* assign the pointer */
1293: cusparsestruct->mat = matstruct;
1295: if (!a->compressedrow.use) {
1296: PetscFree(ii);
1297: PetscFree(ridx);
1298: }
1299: cusparsestruct->workVector = new THRUSTARRAY;
1300: cusparsestruct->workVector->resize(m);
1301: } catch(char *ex) {
1302: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1303: }
1304: cusparsestruct->nonzerostate = A->nonzerostate;
1305: }
1306: WaitForGPU();CHKERRCUDA(ierr);
1307: A->valid_GPU_matrix = PETSC_CUDA_BOTH;
1308: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1309: }
1310: return(0);
1311: }
1313: static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
1314: {
1316: PetscInt rbs,cbs;
1319: MatGetBlockSizes(mat,&rbs,&cbs);
1320: if (right) {
1321: VecCreate(PetscObjectComm((PetscObject)mat),right);
1322: VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
1323: VecSetBlockSize(*right,cbs);
1324: VecSetType(*right,VECSEQCUDA);
1325: PetscLayoutReference(mat->cmap,&(*right)->map);
1326: }
1327: if (left) {
1328: VecCreate(PetscObjectComm((PetscObject)mat),left);
1329: VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
1330: VecSetBlockSize(*left,rbs);
1331: VecSetType(*left,VECSEQCUDA);
1332: PetscLayoutReference(mat->rmap,&(*left)->map);
1333: }
1334: return(0);
1335: }
1337: struct VecCUDAPlusEquals
1338: {
1339: template <typename Tuple>
1340: __host__ __device__
1341: void operator()(Tuple t)
1342: {
1343: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1344: }
1345: };
1347: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1348: {
1349: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1350: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1351: Mat_SeqAIJCUSPARSEMultStruct *matstruct; /* Do not initialize yet, because GPU data may not be available yet */
1352: const PetscScalar *xarray;
1353: PetscScalar *yarray;
1354: PetscErrorCode ierr;
1355: cusparseStatus_t stat;
1358: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1359: MatSeqAIJCUSPARSECopyToGPU(A);
1360: VecCUDAGetArrayRead(xx,&xarray);
1361: VecSet(yy,0);
1362: VecCUDAGetArrayWrite(yy,&yarray);
1363: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1364: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1365: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1366: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1367: mat->num_rows, mat->num_cols, mat->num_entries,
1368: matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
1369: mat->column_indices->data().get(), xarray, matstruct->beta,
1370: yarray);CHKERRCUDA(stat);
1371: } else {
1372: #if CUDA_VERSION>=4020
1373: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1374: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1375: matstruct->alpha, matstruct->descr, hybMat,
1376: xarray, matstruct->beta,
1377: yarray);CHKERRCUDA(stat);
1378: #endif
1379: }
1380: VecCUDARestoreArrayRead(xx,&xarray);
1381: VecCUDARestoreArrayWrite(yy,&yarray);
1382: if (!cusparsestruct->stream) {
1383: WaitForGPU();CHKERRCUDA(ierr);
1384: }
1385: PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1386: return(0);
1387: }
1389: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1390: {
1391: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1392: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1393: Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1394: const PetscScalar *xarray;
1395: PetscScalar *yarray;
1396: PetscErrorCode ierr;
1397: cusparseStatus_t stat;
1400: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1401: MatSeqAIJCUSPARSECopyToGPU(A);
1402: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1403: if (!matstructT) {
1404: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1405: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1406: }
1407: VecCUDAGetArrayRead(xx,&xarray);
1408: VecSet(yy,0);
1409: VecCUDAGetArrayWrite(yy,&yarray);
1411: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1412: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1413: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1414: mat->num_rows, mat->num_cols,
1415: mat->num_entries, matstructT->alpha, matstructT->descr,
1416: mat->values->data().get(), mat->row_offsets->data().get(),
1417: mat->column_indices->data().get(), xarray, matstructT->beta,
1418: yarray);CHKERRCUDA(stat);
1419: } else {
1420: #if CUDA_VERSION>=4020
1421: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1422: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1423: matstructT->alpha, matstructT->descr, hybMat,
1424: xarray, matstructT->beta,
1425: yarray);CHKERRCUDA(stat);
1426: #endif
1427: }
1428: VecCUDARestoreArrayRead(xx,&xarray);
1429: VecCUDARestoreArrayWrite(yy,&yarray);
1430: if (!cusparsestruct->stream) {
1431: WaitForGPU();CHKERRCUDA(ierr);
1432: }
1433: PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1434: return(0);
1435: }
1438: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1439: {
1440: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1441: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1442: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1443: thrust::device_ptr<PetscScalar> zptr;
1444: const PetscScalar *xarray;
1445: PetscScalar *zarray;
1446: PetscErrorCode ierr;
1447: cusparseStatus_t stat;
1450: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1451: MatSeqAIJCUSPARSECopyToGPU(A);
1452: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1453: try {
1454: VecCopy_SeqCUDA(yy,zz);
1455: VecCUDAGetArrayRead(xx,&xarray);
1456: VecCUDAGetArrayReadWrite(zz,&zarray);
1457: zptr = thrust::device_pointer_cast(zarray);
1459: /* multiply add */
1460: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1461: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1462: /* here we need to be careful to set the number of rows in the multiply to the
1463: number of compressed rows in the matrix ... which is equivalent to the
1464: size of the workVector */
1465: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1466: mat->num_rows, mat->num_cols,
1467: mat->num_entries, matstruct->alpha, matstruct->descr,
1468: mat->values->data().get(), mat->row_offsets->data().get(),
1469: mat->column_indices->data().get(), xarray, matstruct->beta,
1470: cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1471: } else {
1472: #if CUDA_VERSION>=4020
1473: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1474: if (cusparsestruct->workVector->size()) {
1475: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1476: matstruct->alpha, matstruct->descr, hybMat,
1477: xarray, matstruct->beta,
1478: cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1479: }
1480: #endif
1481: }
1483: /* scatter the data from the temporary into the full vector with a += operation */
1484: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1485: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1486: VecCUDAPlusEquals());
1487: VecCUDARestoreArrayRead(xx,&xarray);
1488: VecCUDARestoreArrayReadWrite(zz,&zarray);
1490: } catch(char *ex) {
1491: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1492: }
1493: WaitForGPU();CHKERRCUDA(ierr);
1494: PetscLogFlops(2.0*a->nz);
1495: return(0);
1496: }
1498: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1499: {
1500: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1501: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1502: Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1503: thrust::device_ptr<PetscScalar> zptr;
1504: const PetscScalar *xarray;
1505: PetscScalar *zarray;
1506: PetscErrorCode ierr;
1507: cusparseStatus_t stat;
1510: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1511: MatSeqAIJCUSPARSECopyToGPU(A);
1512: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1513: if (!matstructT) {
1514: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1515: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1516: }
1518: try {
1519: VecCopy_SeqCUDA(yy,zz);
1520: VecCUDAGetArrayRead(xx,&xarray);
1521: VecCUDAGetArrayReadWrite(zz,&zarray);
1522: zptr = thrust::device_pointer_cast(zarray);
1524: /* multiply add with matrix transpose */
1525: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1526: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1527: /* here we need to be careful to set the number of rows in the multiply to the
1528: number of compressed rows in the matrix ... which is equivalent to the
1529: size of the workVector */
1530: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1531: mat->num_rows, mat->num_cols,
1532: mat->num_entries, matstructT->alpha, matstructT->descr,
1533: mat->values->data().get(), mat->row_offsets->data().get(),
1534: mat->column_indices->data().get(), xarray, matstructT->beta,
1535: cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1536: } else {
1537: #if CUDA_VERSION>=4020
1538: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1539: if (cusparsestruct->workVector->size()) {
1540: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1541: matstructT->alpha, matstructT->descr, hybMat,
1542: xarray, matstructT->beta,
1543: cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1544: }
1545: #endif
1546: }
1548: /* scatter the data from the temporary into the full vector with a += operation */
1549: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
1550: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1551: VecCUDAPlusEquals());
1553: VecCUDARestoreArrayRead(xx,&xarray);
1554: VecCUDARestoreArrayReadWrite(zz,&zarray);
1556: } catch(char *ex) {
1557: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1558: }
1559: WaitForGPU();CHKERRCUDA(ierr);
1560: PetscLogFlops(2.0*a->nz);
1561: return(0);
1562: }
1564: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1565: {
1569: MatAssemblyEnd_SeqAIJ(A,mode);
1570: if (A->factortype==MAT_FACTOR_NONE) {
1571: MatSeqAIJCUSPARSECopyToGPU(A);
1572: }
1573: if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1574: A->ops->mult = MatMult_SeqAIJCUSPARSE;
1575: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1576: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1577: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1578: return(0);
1579: }
1581: /* --------------------------------------------------------------------------------*/
1582: /*@
1583: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1584: (the default parallel PETSc format). This matrix will ultimately pushed down
1585: to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1586: assembly performance the user should preallocate the matrix storage by setting
1587: the parameter nz (or the array nnz). By setting these parameters accurately,
1588: performance during matrix assembly can be increased by more than a factor of 50.
1590: Collective on MPI_Comm
1592: Input Parameters:
1593: + comm - MPI communicator, set to PETSC_COMM_SELF
1594: . m - number of rows
1595: . n - number of columns
1596: . nz - number of nonzeros per row (same for all rows)
1597: - nnz - array containing the number of nonzeros in the various rows
1598: (possibly different for each row) or NULL
1600: Output Parameter:
1601: . A - the matrix
1603: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1604: MatXXXXSetPreallocation() paradgm instead of this routine directly.
1605: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1607: Notes:
1608: If nnz is given then nz is ignored
1610: The AIJ format (also called the Yale sparse matrix format or
1611: compressed row storage), is fully compatible with standard Fortran 77
1612: storage. That is, the stored row and column indices can begin at
1613: either one (as in Fortran) or zero. See the users' manual for details.
1615: Specify the preallocated storage with either nz or nnz (not both).
1616: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1617: allocation. For large problems you MUST preallocate memory or you
1618: will get TERRIBLE performance, see the users' manual chapter on matrices.
1620: By default, this format uses inodes (identical nodes) when possible, to
1621: improve numerical efficiency of matrix-vector products and solves. We
1622: search for consecutive rows with the same nonzero structure, thereby
1623: reusing matrix information to achieve increased efficiency.
1625: Level: intermediate
1627: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1628: @*/
1629: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1630: {
1634: MatCreate(comm,A);
1635: MatSetSizes(*A,m,n,m,n);
1636: MatSetType(*A,MATSEQAIJCUSPARSE);
1637: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1638: return(0);
1639: }
1641: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1642: {
1643: PetscErrorCode ierr;
1646: if (A->factortype==MAT_FACTOR_NONE) {
1647: if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) {
1648: Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1649: }
1650: } else {
1651: Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1652: }
1653: MatDestroy_SeqAIJ(A);
1654: return(0);
1655: }
1657: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1658: {
1660: Mat C;
1661: cusparseStatus_t stat;
1662: cusparseHandle_t handle=0;
1665: MatDuplicate_SeqAIJ(A,cpvalues,B);
1666: C = *B;
1667: /* inject CUSPARSE-specific stuff */
1668: if (C->factortype==MAT_FACTOR_NONE) {
1669: /* you cannot check the inode.use flag here since the matrix was just created.
1670: now build a GPU matrix data structure */
1671: C->spptr = new Mat_SeqAIJCUSPARSE;
1672: ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0;
1673: ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
1674: ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0;
1675: ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR;
1676: ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0;
1677: ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = 0;
1678: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1679: ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle;
1680: ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0;
1681: ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
1682: } else {
1683: /* NEXT, set the pointers to the triangular factors */
1684: C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1685: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0;
1686: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0;
1687: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1688: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1689: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0;
1690: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0;
1691: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0;
1692: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0;
1693: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1694: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle;
1695: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0;
1696: }
1698: C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
1699: C->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
1700: C->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE;
1701: C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
1702: C->ops->mult = MatMult_SeqAIJCUSPARSE;
1703: C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1704: C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1705: C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1706: C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
1708: PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);
1710: C->valid_GPU_matrix = PETSC_CUDA_UNALLOCATED;
1712: PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1713: return(0);
1714: }
1716: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1717: {
1719: cusparseStatus_t stat;
1720: cusparseHandle_t handle=0;
1723: MatCreate_SeqAIJ(B);
1724: if (B->factortype==MAT_FACTOR_NONE) {
1725: /* you cannot check the inode.use flag here since the matrix was just created.
1726: now build a GPU matrix data structure */
1727: B->spptr = new Mat_SeqAIJCUSPARSE;
1728: ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0;
1729: ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1730: ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0;
1731: ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR;
1732: ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0;
1733: ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0;
1734: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1735: ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle;
1736: ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0;
1737: ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
1738: } else {
1739: /* NEXT, set the pointers to the triangular factors */
1740: B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1741: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
1742: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
1743: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1744: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1745: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0;
1746: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0;
1747: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0;
1748: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0;
1749: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1750: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle;
1751: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0;
1752: }
1754: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
1755: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
1756: B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE;
1757: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
1758: B->ops->mult = MatMult_SeqAIJCUSPARSE;
1759: B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1760: B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1761: B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1762: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
1764: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
1766: B->valid_GPU_matrix = PETSC_CUDA_UNALLOCATED;
1768: PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1769: return(0);
1770: }
1772: /*M
1773: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1775: A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1776: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1777: All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1779: Options Database Keys:
1780: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1781: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1782: . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1784: Level: beginner
1786: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1787: M*/
1789: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1792: PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void)
1793: {
1797: MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
1798: MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
1799: MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
1800: MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
1801: return(0);
1802: }
1805: static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1806: {
1807: cusparseStatus_t stat;
1808: cusparseHandle_t handle;
1811: if (*cusparsestruct) {
1812: Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1813: Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1814: delete (*cusparsestruct)->workVector;
1815: if (handle = (*cusparsestruct)->handle) {
1816: stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1817: }
1818: delete *cusparsestruct;
1819: *cusparsestruct = 0;
1820: }
1821: return(0);
1822: }
1824: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1825: {
1827: if (*mat) {
1828: delete (*mat)->values;
1829: delete (*mat)->column_indices;
1830: delete (*mat)->row_offsets;
1831: delete *mat;
1832: *mat = 0;
1833: }
1834: return(0);
1835: }
1837: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1838: {
1839: cusparseStatus_t stat;
1840: PetscErrorCode ierr;
1843: if (*trifactor) {
1844: if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1845: if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1846: CsrMatrix_Destroy(&(*trifactor)->csrMat);
1847: delete *trifactor;
1848: *trifactor = 0;
1849: }
1850: return(0);
1851: }
1853: static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1854: {
1855: CsrMatrix *mat;
1856: cusparseStatus_t stat;
1857: cudaError_t err;
1860: if (*matstruct) {
1861: if ((*matstruct)->mat) {
1862: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1863: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1864: stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1865: } else {
1866: mat = (CsrMatrix*)(*matstruct)->mat;
1867: CsrMatrix_Destroy(&mat);
1868: }
1869: }
1870: if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1871: delete (*matstruct)->cprowIndices;
1872: if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1873: if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
1874: delete *matstruct;
1875: *matstruct = 0;
1876: }
1877: return(0);
1878: }
1880: static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1881: {
1882: cusparseHandle_t handle;
1883: cusparseStatus_t stat;
1886: if (*trifactors) {
1887: Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1888: Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1889: Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1890: Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1891: delete (*trifactors)->rpermIndices;
1892: delete (*trifactors)->cpermIndices;
1893: delete (*trifactors)->workVector;
1894: if (handle = (*trifactors)->handle) {
1895: stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1896: }
1897: delete *trifactors;
1898: *trifactors = 0;
1899: }
1900: return(0);
1901: }