Actual source code: aijcusparse.cu
petsc-3.9.4 2018-09-11
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library,
4: */
5: #define PETSC_SKIP_SPINLOCK
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/vecimpl.h>
12: #undef VecType
13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
17: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
21: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
25: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
35: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
41: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
42: {
43: cusparseStatus_t stat;
44: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
47: cusparsestruct->stream = stream;
48: stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
49: return(0);
50: }
52: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
53: {
54: cusparseStatus_t stat;
55: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
58: if (cusparsestruct->handle != handle) {
59: if (cusparsestruct->handle) {
60: stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
61: }
62: cusparsestruct->handle = handle;
63: }
64: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
65: return(0);
66: }
68: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
69: {
70: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
72: if (cusparsestruct->handle)
73: cusparsestruct->handle = 0;
74: return(0);
75: }
77: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
78: {
80: *type = MATSOLVERCUSPARSE;
81: return(0);
82: }
84: /*MC
85: MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86: on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
87: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
88: performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
89: CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90: algorithms are not recommended. This class does NOT support direct solver operations.
92: Level: beginner
94: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95: M*/
97: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
98: {
100: PetscInt n = A->rmap->n;
103: MatCreate(PetscObjectComm((PetscObject)A),B);
104: (*B)->factortype = ftype;
105: MatSetSizes(*B,n,n,n,n);
106: MatSetType(*B,MATSEQAIJCUSPARSE);
108: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
109: MatSetBlockSizesFromMats(*B,A,A);
110: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
111: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
115: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
117: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
118: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
119: return(0);
120: }
122: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
123: {
124: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
127: #if CUDA_VERSION>=4020
128: switch (op) {
129: case MAT_CUSPARSE_MULT:
130: cusparsestruct->format = format;
131: break;
132: case MAT_CUSPARSE_ALL:
133: cusparsestruct->format = format;
134: break;
135: default:
136: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
137: }
138: #else
139: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
140: #endif
141: return(0);
142: }
144: /*@
145: MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146: operation. Only the MatMult operation can use different GPU storage formats
147: for MPIAIJCUSPARSE matrices.
148: Not Collective
150: Input Parameters:
151: + A - Matrix of type SEQAIJCUSPARSE
152: . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153: - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
155: Output Parameter:
157: Level: intermediate
159: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160: @*/
161: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
162: {
167: PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
168: return(0);
169: }
171: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
172: {
173: PetscErrorCode ierr;
174: MatCUSPARSEStorageFormat format;
175: PetscBool flg;
176: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
179: PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
180: PetscObjectOptionsBegin((PetscObject)A);
181: if (A->factortype==MAT_FACTOR_NONE) {
182: PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
183: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
184: if (flg) {
185: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
186: }
187: }
188: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
189: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
190: if (flg) {
191: MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
192: }
193: PetscOptionsEnd();
194: return(0);
196: }
198: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
199: {
203: MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
204: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
205: return(0);
206: }
208: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
209: {
213: MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
214: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
215: return(0);
216: }
218: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
219: {
223: MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
224: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
225: return(0);
226: }
228: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
229: {
233: MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
234: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
235: return(0);
236: }
238: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
239: {
240: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
241: PetscInt n = A->rmap->n;
242: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
243: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
244: cusparseStatus_t stat;
245: const PetscInt *ai = a->i,*aj = a->j,*vi;
246: const MatScalar *aa = a->a,*v;
247: PetscInt *AiLo, *AjLo;
248: PetscScalar *AALo;
249: PetscInt i,nz, nzLower, offset, rowOffset;
250: PetscErrorCode ierr;
253: if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
254: try {
255: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
256: nzLower=n+ai[n]-ai[1];
258: /* Allocate Space for the lower triangular matrix */
259: cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
260: cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
261: cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);
263: /* Fill the lower triangular matrix */
264: AiLo[0] = (PetscInt) 0;
265: AiLo[n] = nzLower;
266: AjLo[0] = (PetscInt) 0;
267: AALo[0] = (MatScalar) 1.0;
268: v = aa;
269: vi = aj;
270: offset = 1;
271: rowOffset= 1;
272: for (i=1; i<n; i++) {
273: nz = ai[i+1] - ai[i];
274: /* additional 1 for the term on the diagonal */
275: AiLo[i] = rowOffset;
276: rowOffset += nz+1;
278: PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));
279: PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));
281: offset += nz;
282: AjLo[offset] = (PetscInt) i;
283: AALo[offset] = (MatScalar) 1.0;
284: offset += 1;
286: v += nz;
287: vi += nz;
288: }
290: /* allocate space for the triangular factor information */
291: loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
293: /* Create the matrix description */
294: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
295: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
296: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
297: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
298: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
300: /* Create the solve analysis information */
301: stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
303: /* set the operation */
304: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
306: /* set the matrix */
307: loTriFactor->csrMat = new CsrMatrix;
308: loTriFactor->csrMat->num_rows = n;
309: loTriFactor->csrMat->num_cols = n;
310: loTriFactor->csrMat->num_entries = nzLower;
312: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
313: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
315: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
316: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
318: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
319: loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
321: /* perform the solve analysis */
322: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
323: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
324: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
325: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
327: /* assign the pointer. Is this really necessary? */
328: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
330: cudaFreeHost(AiLo);CHKERRCUDA(ierr);
331: cudaFreeHost(AjLo);CHKERRCUDA(ierr);
332: cudaFreeHost(AALo);CHKERRCUDA(ierr);
333: } catch(char *ex) {
334: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
335: }
336: }
337: return(0);
338: }
340: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
341: {
342: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
343: PetscInt n = A->rmap->n;
344: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
345: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
346: cusparseStatus_t stat;
347: const PetscInt *aj = a->j,*adiag = a->diag,*vi;
348: const MatScalar *aa = a->a,*v;
349: PetscInt *AiUp, *AjUp;
350: PetscScalar *AAUp;
351: PetscInt i,nz, nzUpper, offset;
352: PetscErrorCode ierr;
355: if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
356: try {
357: /* next, figure out the number of nonzeros in the upper triangular matrix. */
358: nzUpper = adiag[0]-adiag[n];
360: /* Allocate Space for the upper triangular matrix */
361: cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
362: cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
363: cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
365: /* Fill the upper triangular matrix */
366: AiUp[0]=(PetscInt) 0;
367: AiUp[n]=nzUpper;
368: offset = nzUpper;
369: for (i=n-1; i>=0; i--) {
370: v = aa + adiag[i+1] + 1;
371: vi = aj + adiag[i+1] + 1;
373: /* number of elements NOT on the diagonal */
374: nz = adiag[i] - adiag[i+1]-1;
376: /* decrement the offset */
377: offset -= (nz+1);
379: /* first, set the diagonal elements */
380: AjUp[offset] = (PetscInt) i;
381: AAUp[offset] = (MatScalar)1./v[nz];
382: AiUp[i] = AiUp[i+1] - (nz+1);
384: PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));
385: PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));
386: }
388: /* allocate space for the triangular factor information */
389: upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
391: /* Create the matrix description */
392: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
393: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
394: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
395: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
396: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
398: /* Create the solve analysis information */
399: stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
401: /* set the operation */
402: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
404: /* set the matrix */
405: upTriFactor->csrMat = new CsrMatrix;
406: upTriFactor->csrMat->num_rows = n;
407: upTriFactor->csrMat->num_cols = n;
408: upTriFactor->csrMat->num_entries = nzUpper;
410: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
411: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
413: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
414: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
416: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
417: upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
419: /* perform the solve analysis */
420: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
421: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
422: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
423: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
425: /* assign the pointer. Is this really necessary? */
426: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
428: cudaFreeHost(AiUp);CHKERRCUDA(ierr);
429: cudaFreeHost(AjUp);CHKERRCUDA(ierr);
430: cudaFreeHost(AAUp);CHKERRCUDA(ierr);
431: } catch(char *ex) {
432: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
433: }
434: }
435: return(0);
436: }
438: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
439: {
440: PetscErrorCode ierr;
441: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
442: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
443: IS isrow = a->row,iscol = a->icol;
444: PetscBool row_identity,col_identity;
445: const PetscInt *r,*c;
446: PetscInt n = A->rmap->n;
449: MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
450: MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);
452: cusparseTriFactors->workVector = new THRUSTARRAY;
453: cusparseTriFactors->workVector->resize(n);
454: cusparseTriFactors->nnz=a->nz;
456: A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
457: /*lower triangular indices */
458: ISGetIndices(isrow,&r);
459: ISIdentity(isrow,&row_identity);
460: if (!row_identity) {
461: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
462: cusparseTriFactors->rpermIndices->assign(r, r+n);
463: }
464: ISRestoreIndices(isrow,&r);
466: /*upper triangular indices */
467: ISGetIndices(iscol,&c);
468: ISIdentity(iscol,&col_identity);
469: if (!col_identity) {
470: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
471: cusparseTriFactors->cpermIndices->assign(c, c+n);
472: }
473: ISRestoreIndices(iscol,&c);
474: return(0);
475: }
477: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
478: {
479: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
480: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
481: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
482: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
483: cusparseStatus_t stat;
484: PetscErrorCode ierr;
485: PetscInt *AiUp, *AjUp;
486: PetscScalar *AAUp;
487: PetscScalar *AALo;
488: PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
489: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
490: const PetscInt *ai = b->i,*aj = b->j,*vj;
491: const MatScalar *aa = b->a,*v;
494: if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
495: try {
496: /* Allocate Space for the upper triangular matrix */
497: cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
498: cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
499: cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
500: cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
502: /* Fill the upper triangular matrix */
503: AiUp[0]=(PetscInt) 0;
504: AiUp[n]=nzUpper;
505: offset = 0;
506: for (i=0; i<n; i++) {
507: /* set the pointers */
508: v = aa + ai[i];
509: vj = aj + ai[i];
510: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
512: /* first, set the diagonal elements */
513: AjUp[offset] = (PetscInt) i;
514: AAUp[offset] = (MatScalar)1.0/v[nz];
515: AiUp[i] = offset;
516: AALo[offset] = (MatScalar)1.0/v[nz];
518: offset+=1;
519: if (nz>0) {
520: PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));
521: PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));
522: for (j=offset; j<offset+nz; j++) {
523: AAUp[j] = -AAUp[j];
524: AALo[j] = AAUp[j]/v[nz];
525: }
526: offset+=nz;
527: }
528: }
530: /* allocate space for the triangular factor information */
531: upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
533: /* Create the matrix description */
534: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
535: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
536: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
537: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
538: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
540: /* Create the solve analysis information */
541: stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
543: /* set the operation */
544: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
546: /* set the matrix */
547: upTriFactor->csrMat = new CsrMatrix;
548: upTriFactor->csrMat->num_rows = A->rmap->n;
549: upTriFactor->csrMat->num_cols = A->cmap->n;
550: upTriFactor->csrMat->num_entries = a->nz;
552: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
553: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
555: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
556: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
558: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
559: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
561: /* perform the solve analysis */
562: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
563: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
564: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
565: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
567: /* assign the pointer. Is this really necessary? */
568: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
570: /* allocate space for the triangular factor information */
571: loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
573: /* Create the matrix description */
574: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
575: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
576: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
577: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
578: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
580: /* Create the solve analysis information */
581: stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
583: /* set the operation */
584: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
586: /* set the matrix */
587: loTriFactor->csrMat = new CsrMatrix;
588: loTriFactor->csrMat->num_rows = A->rmap->n;
589: loTriFactor->csrMat->num_cols = A->cmap->n;
590: loTriFactor->csrMat->num_entries = a->nz;
592: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
593: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
595: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
596: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
598: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
599: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
601: /* perform the solve analysis */
602: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
603: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
604: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
605: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
607: /* assign the pointer. Is this really necessary? */
608: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
610: A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
611: cudaFreeHost(AiUp);CHKERRCUDA(ierr);
612: cudaFreeHost(AjUp);CHKERRCUDA(ierr);
613: cudaFreeHost(AAUp);CHKERRCUDA(ierr);
614: cudaFreeHost(AALo);CHKERRCUDA(ierr);
615: } catch(char *ex) {
616: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
617: }
618: }
619: return(0);
620: }
622: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
623: {
624: PetscErrorCode ierr;
625: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
626: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
627: IS ip = a->row;
628: const PetscInt *rip;
629: PetscBool perm_identity;
630: PetscInt n = A->rmap->n;
633: MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
634: cusparseTriFactors->workVector = new THRUSTARRAY;
635: cusparseTriFactors->workVector->resize(n);
636: cusparseTriFactors->nnz=(a->nz-n)*2 + n;
638: /*lower triangular indices */
639: ISGetIndices(ip,&rip);
640: ISIdentity(ip,&perm_identity);
641: if (!perm_identity) {
642: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
643: cusparseTriFactors->rpermIndices->assign(rip, rip+n);
644: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
645: cusparseTriFactors->cpermIndices->assign(rip, rip+n);
646: }
647: ISRestoreIndices(ip,&rip);
648: return(0);
649: }
651: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
652: {
653: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
654: IS isrow = b->row,iscol = b->col;
655: PetscBool row_identity,col_identity;
659: MatLUFactorNumeric_SeqAIJ(B,A,info);
660: /* determine which version of MatSolve needs to be used. */
661: ISIdentity(isrow,&row_identity);
662: ISIdentity(iscol,&col_identity);
663: if (row_identity && col_identity) {
664: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
665: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
666: } else {
667: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
668: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
669: }
671: /* get the triangular factors */
672: MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
673: return(0);
674: }
676: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
677: {
678: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
679: IS ip = b->row;
680: PetscBool perm_identity;
684: MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
686: /* determine which version of MatSolve needs to be used. */
687: ISIdentity(ip,&perm_identity);
688: if (perm_identity) {
689: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
690: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
691: } else {
692: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
693: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
694: }
696: /* get the triangular factors */
697: MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
698: return(0);
699: }
701: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
702: {
703: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
704: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
705: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
706: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
707: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
708: cusparseStatus_t stat;
709: cusparseIndexBase_t indexBase;
710: cusparseMatrixType_t matrixType;
711: cusparseFillMode_t fillMode;
712: cusparseDiagType_t diagType;
716: /*********************************************/
717: /* Now the Transpose of the Lower Tri Factor */
718: /*********************************************/
720: /* allocate space for the transpose of the lower triangular factor */
721: loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
723: /* set the matrix descriptors of the lower triangular factor */
724: matrixType = cusparseGetMatType(loTriFactor->descr);
725: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
726: fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
727: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
728: diagType = cusparseGetMatDiagType(loTriFactor->descr);
730: /* Create the matrix description */
731: stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
732: stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
733: stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
734: stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
735: stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);
737: /* Create the solve analysis information */
738: stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);
740: /* set the operation */
741: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
743: /* allocate GPU space for the CSC of the lower triangular factor*/
744: loTriFactorT->csrMat = new CsrMatrix;
745: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
746: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
747: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
748: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
749: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
750: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
752: /* compute the transpose of the lower triangular factor, i.e. the CSC */
753: stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
754: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
755: loTriFactor->csrMat->values->data().get(),
756: loTriFactor->csrMat->row_offsets->data().get(),
757: loTriFactor->csrMat->column_indices->data().get(),
758: loTriFactorT->csrMat->values->data().get(),
759: loTriFactorT->csrMat->column_indices->data().get(),
760: loTriFactorT->csrMat->row_offsets->data().get(),
761: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
763: /* perform the solve analysis on the transposed matrix */
764: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
765: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
766: loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
767: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
768: loTriFactorT->solveInfo);CHKERRCUDA(stat);
770: /* assign the pointer. Is this really necessary? */
771: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
773: /*********************************************/
774: /* Now the Transpose of the Upper Tri Factor */
775: /*********************************************/
777: /* allocate space for the transpose of the upper triangular factor */
778: upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
780: /* set the matrix descriptors of the upper triangular factor */
781: matrixType = cusparseGetMatType(upTriFactor->descr);
782: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
783: fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
784: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
785: diagType = cusparseGetMatDiagType(upTriFactor->descr);
787: /* Create the matrix description */
788: stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
789: stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
790: stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
791: stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
792: stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);
794: /* Create the solve analysis information */
795: stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);
797: /* set the operation */
798: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
800: /* allocate GPU space for the CSC of the upper triangular factor*/
801: upTriFactorT->csrMat = new CsrMatrix;
802: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
803: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
804: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
805: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
806: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
807: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
809: /* compute the transpose of the upper triangular factor, i.e. the CSC */
810: stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
811: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
812: upTriFactor->csrMat->values->data().get(),
813: upTriFactor->csrMat->row_offsets->data().get(),
814: upTriFactor->csrMat->column_indices->data().get(),
815: upTriFactorT->csrMat->values->data().get(),
816: upTriFactorT->csrMat->column_indices->data().get(),
817: upTriFactorT->csrMat->row_offsets->data().get(),
818: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
820: /* perform the solve analysis on the transposed matrix */
821: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
822: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
823: upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
824: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
825: upTriFactorT->solveInfo);CHKERRCUDA(stat);
827: /* assign the pointer. Is this really necessary? */
828: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
829: return(0);
830: }
832: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
833: {
834: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
835: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
836: Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
837: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
838: cusparseStatus_t stat;
839: cusparseIndexBase_t indexBase;
840: cudaError_t err;
844: /* allocate space for the triangular factor information */
845: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
846: stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
847: indexBase = cusparseGetMatIndexBase(matstruct->descr);
848: stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
849: stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
851: /* set alpha and beta */
852: err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
853: err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
854: err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
855: err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
856: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
858: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
859: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
860: CsrMatrix *matrixT= new CsrMatrix;
861: matrixT->num_rows = A->cmap->n;
862: matrixT->num_cols = A->rmap->n;
863: matrixT->num_entries = a->nz;
864: matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
865: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
866: matrixT->values = new THRUSTARRAY(a->nz);
868: /* compute the transpose of the upper triangular factor, i.e. the CSC */
869: indexBase = cusparseGetMatIndexBase(matstruct->descr);
870: stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
871: matrix->num_cols, matrix->num_entries,
872: matrix->values->data().get(),
873: matrix->row_offsets->data().get(),
874: matrix->column_indices->data().get(),
875: matrixT->values->data().get(),
876: matrixT->column_indices->data().get(),
877: matrixT->row_offsets->data().get(),
878: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
880: /* assign the pointer */
881: matstructT->mat = matrixT;
883: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
884: #if CUDA_VERSION>=5000
885: /* First convert HYB to CSR */
886: CsrMatrix *temp= new CsrMatrix;
887: temp->num_rows = A->rmap->n;
888: temp->num_cols = A->cmap->n;
889: temp->num_entries = a->nz;
890: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
891: temp->column_indices = new THRUSTINTARRAY32(a->nz);
892: temp->values = new THRUSTARRAY(a->nz);
895: stat = cusparse_hyb2csr(cusparsestruct->handle,
896: matstruct->descr, (cusparseHybMat_t)matstruct->mat,
897: temp->values->data().get(),
898: temp->row_offsets->data().get(),
899: temp->column_indices->data().get());CHKERRCUDA(stat);
901: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
902: CsrMatrix *tempT= new CsrMatrix;
903: tempT->num_rows = A->rmap->n;
904: tempT->num_cols = A->cmap->n;
905: tempT->num_entries = a->nz;
906: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
907: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
908: tempT->values = new THRUSTARRAY(a->nz);
910: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
911: temp->num_cols, temp->num_entries,
912: temp->values->data().get(),
913: temp->row_offsets->data().get(),
914: temp->column_indices->data().get(),
915: tempT->values->data().get(),
916: tempT->column_indices->data().get(),
917: tempT->row_offsets->data().get(),
918: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
920: /* Last, convert CSC to HYB */
921: cusparseHybMat_t hybMat;
922: stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
923: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
924: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
925: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
926: matstructT->descr, tempT->values->data().get(),
927: tempT->row_offsets->data().get(),
928: tempT->column_indices->data().get(),
929: hybMat, 0, partition);CHKERRCUDA(stat);
931: /* assign the pointer */
932: matstructT->mat = hybMat;
934: /* delete temporaries */
935: if (tempT) {
936: if (tempT->values) delete (THRUSTARRAY*) tempT->values;
937: if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
938: if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
939: delete (CsrMatrix*) tempT;
940: }
941: if (temp) {
942: if (temp->values) delete (THRUSTARRAY*) temp->values;
943: if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
944: if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
945: delete (CsrMatrix*) temp;
946: }
947: #else
948: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
949: #endif
950: }
951: /* assign the compressed row indices */
952: matstructT->cprowIndices = new THRUSTINTARRAY;
953: matstructT->cprowIndices->resize(A->cmap->n);
954: thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
956: /* assign the pointer */
957: ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
958: return(0);
959: }
961: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
962: {
963: PetscInt n = xx->map->n;
964: const PetscScalar *barray;
965: PetscScalar *xarray;
966: thrust::device_ptr<const PetscScalar> bGPU;
967: thrust::device_ptr<PetscScalar> xGPU;
968: cusparseStatus_t stat;
969: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
970: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
971: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
972: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
973: PetscErrorCode ierr;
976: /* Analyze the matrix and create the transpose ... on the fly */
977: if (!loTriFactorT && !upTriFactorT) {
978: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
979: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
980: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
981: }
983: /* Get the GPU pointers */
984: VecCUDAGetArrayWrite(xx,&xarray);
985: VecCUDAGetArrayRead(bb,&barray);
986: xGPU = thrust::device_pointer_cast(xarray);
987: bGPU = thrust::device_pointer_cast(barray);
989: /* First, reorder with the row permutation */
990: thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
991: thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
992: xGPU);
994: /* First, solve U */
995: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
996: upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
997: upTriFactorT->csrMat->values->data().get(),
998: upTriFactorT->csrMat->row_offsets->data().get(),
999: upTriFactorT->csrMat->column_indices->data().get(),
1000: upTriFactorT->solveInfo,
1001: xarray, tempGPU->data().get());CHKERRCUDA(stat);
1003: /* Then, solve L */
1004: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1005: loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1006: loTriFactorT->csrMat->values->data().get(),
1007: loTriFactorT->csrMat->row_offsets->data().get(),
1008: loTriFactorT->csrMat->column_indices->data().get(),
1009: loTriFactorT->solveInfo,
1010: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1012: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1013: thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1014: thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1015: tempGPU->begin());
1017: /* Copy the temporary to the full solution. */
1018: thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1020: /* restore */
1021: VecCUDARestoreArrayRead(bb,&barray);
1022: VecCUDARestoreArrayWrite(xx,&xarray);
1023: WaitForGPU();CHKERRCUDA(ierr);
1025: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1026: return(0);
1027: }
1029: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1030: {
1031: const PetscScalar *barray;
1032: PetscScalar *xarray;
1033: cusparseStatus_t stat;
1034: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1035: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1036: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1037: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1038: PetscErrorCode ierr;
1041: /* Analyze the matrix and create the transpose ... on the fly */
1042: if (!loTriFactorT && !upTriFactorT) {
1043: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1044: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1045: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1046: }
1048: /* Get the GPU pointers */
1049: VecCUDAGetArrayWrite(xx,&xarray);
1050: VecCUDAGetArrayRead(bb,&barray);
1052: /* First, solve U */
1053: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1054: upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1055: upTriFactorT->csrMat->values->data().get(),
1056: upTriFactorT->csrMat->row_offsets->data().get(),
1057: upTriFactorT->csrMat->column_indices->data().get(),
1058: upTriFactorT->solveInfo,
1059: barray, tempGPU->data().get());CHKERRCUDA(stat);
1061: /* Then, solve L */
1062: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1063: loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1064: loTriFactorT->csrMat->values->data().get(),
1065: loTriFactorT->csrMat->row_offsets->data().get(),
1066: loTriFactorT->csrMat->column_indices->data().get(),
1067: loTriFactorT->solveInfo,
1068: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1070: /* restore */
1071: VecCUDARestoreArrayRead(bb,&barray);
1072: VecCUDARestoreArrayWrite(xx,&xarray);
1073: WaitForGPU();CHKERRCUDA(ierr);
1074: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1075: return(0);
1076: }
1078: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1079: {
1080: const PetscScalar *barray;
1081: PetscScalar *xarray;
1082: thrust::device_ptr<const PetscScalar> bGPU;
1083: thrust::device_ptr<PetscScalar> xGPU;
1084: cusparseStatus_t stat;
1085: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1086: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1087: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1088: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1089: PetscErrorCode ierr;
1093: /* Get the GPU pointers */
1094: VecCUDAGetArrayWrite(xx,&xarray);
1095: VecCUDAGetArrayRead(bb,&barray);
1096: xGPU = thrust::device_pointer_cast(xarray);
1097: bGPU = thrust::device_pointer_cast(barray);
1099: /* First, reorder with the row permutation */
1100: thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1101: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1102: xGPU);
1104: /* Next, solve L */
1105: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1106: loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1107: loTriFactor->csrMat->values->data().get(),
1108: loTriFactor->csrMat->row_offsets->data().get(),
1109: loTriFactor->csrMat->column_indices->data().get(),
1110: loTriFactor->solveInfo,
1111: xarray, tempGPU->data().get());CHKERRCUDA(stat);
1113: /* Then, solve U */
1114: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1115: upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1116: upTriFactor->csrMat->values->data().get(),
1117: upTriFactor->csrMat->row_offsets->data().get(),
1118: upTriFactor->csrMat->column_indices->data().get(),
1119: upTriFactor->solveInfo,
1120: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1122: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1123: thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1124: thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
1125: tempGPU->begin());
1127: /* Copy the temporary to the full solution. */
1128: thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1130: VecCUDARestoreArrayRead(bb,&barray);
1131: VecCUDARestoreArrayWrite(xx,&xarray);
1132: WaitForGPU();CHKERRCUDA(ierr);
1133: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1134: return(0);
1135: }
1137: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1138: {
1139: const PetscScalar *barray;
1140: PetscScalar *xarray;
1141: cusparseStatus_t stat;
1142: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1143: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1144: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1145: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1146: PetscErrorCode ierr;
1149: /* Get the GPU pointers */
1150: VecCUDAGetArrayWrite(xx,&xarray);
1151: VecCUDAGetArrayRead(bb,&barray);
1153: /* First, solve L */
1154: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1155: loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1156: loTriFactor->csrMat->values->data().get(),
1157: loTriFactor->csrMat->row_offsets->data().get(),
1158: loTriFactor->csrMat->column_indices->data().get(),
1159: loTriFactor->solveInfo,
1160: barray, tempGPU->data().get());CHKERRCUDA(stat);
1162: /* Next, solve U */
1163: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1164: upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1165: upTriFactor->csrMat->values->data().get(),
1166: upTriFactor->csrMat->row_offsets->data().get(),
1167: upTriFactor->csrMat->column_indices->data().get(),
1168: upTriFactor->solveInfo,
1169: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1171: VecCUDARestoreArrayRead(bb,&barray);
1172: VecCUDARestoreArrayWrite(xx,&xarray);
1173: WaitForGPU();CHKERRCUDA(ierr);
1174: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1175: return(0);
1176: }
1178: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1179: {
1181: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1182: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1183: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1184: PetscInt m = A->rmap->n,*ii,*ridx;
1185: PetscErrorCode ierr;
1186: cusparseStatus_t stat;
1187: cudaError_t err;
1190: if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1191: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1192: if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1193: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1194: /* copy values only */
1195: matrix->values->assign(a->a, a->a+a->nz);
1196: } else {
1197: MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1198: try {
1199: cusparsestruct->nonzerorow=0;
1200: for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1202: if (a->compressedrow.use) {
1203: m = a->compressedrow.nrows;
1204: ii = a->compressedrow.i;
1205: ridx = a->compressedrow.rindex;
1206: } else {
1207: /* Forcing compressed row on the GPU */
1208: int k=0;
1209: PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1210: PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1211: ii[0]=0;
1212: for (int j = 0; j<m; j++) {
1213: if ((a->i[j+1]-a->i[j])>0) {
1214: ii[k] = a->i[j];
1215: ridx[k]= j;
1216: k++;
1217: }
1218: }
1219: ii[cusparsestruct->nonzerorow] = a->nz;
1220: m = cusparsestruct->nonzerorow;
1221: }
1223: /* allocate space for the triangular factor information */
1224: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1225: stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1226: stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1227: stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1229: err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
1230: err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1231: err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
1232: err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1233: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1235: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1236: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1237: /* set the matrix */
1238: CsrMatrix *matrix= new CsrMatrix;
1239: matrix->num_rows = m;
1240: matrix->num_cols = A->cmap->n;
1241: matrix->num_entries = a->nz;
1242: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1243: matrix->row_offsets->assign(ii, ii + m+1);
1245: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1246: matrix->column_indices->assign(a->j, a->j+a->nz);
1248: matrix->values = new THRUSTARRAY(a->nz);
1249: matrix->values->assign(a->a, a->a+a->nz);
1251: /* assign the pointer */
1252: matstruct->mat = matrix;
1254: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1255: #if CUDA_VERSION>=4020
1256: CsrMatrix *matrix= new CsrMatrix;
1257: matrix->num_rows = m;
1258: matrix->num_cols = A->cmap->n;
1259: matrix->num_entries = a->nz;
1260: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1261: matrix->row_offsets->assign(ii, ii + m+1);
1263: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1264: matrix->column_indices->assign(a->j, a->j+a->nz);
1266: matrix->values = new THRUSTARRAY(a->nz);
1267: matrix->values->assign(a->a, a->a+a->nz);
1269: cusparseHybMat_t hybMat;
1270: stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1271: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1272: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1273: stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1274: matstruct->descr, matrix->values->data().get(),
1275: matrix->row_offsets->data().get(),
1276: matrix->column_indices->data().get(),
1277: hybMat, 0, partition);CHKERRCUDA(stat);
1278: /* assign the pointer */
1279: matstruct->mat = hybMat;
1281: if (matrix) {
1282: if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1283: if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1284: if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1285: delete (CsrMatrix*)matrix;
1286: }
1287: #endif
1288: }
1290: /* assign the compressed row indices */
1291: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1292: matstruct->cprowIndices->assign(ridx,ridx+m);
1294: /* assign the pointer */
1295: cusparsestruct->mat = matstruct;
1297: if (!a->compressedrow.use) {
1298: PetscFree(ii);
1299: PetscFree(ridx);
1300: }
1301: cusparsestruct->workVector = new THRUSTARRAY;
1302: cusparsestruct->workVector->resize(m);
1303: } catch(char *ex) {
1304: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1305: }
1306: cusparsestruct->nonzerostate = A->nonzerostate;
1307: }
1308: WaitForGPU();CHKERRCUDA(ierr);
1309: A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1310: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1311: }
1312: return(0);
1313: }
1315: static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
1316: {
1318: PetscInt rbs,cbs;
1321: MatGetBlockSizes(mat,&rbs,&cbs);
1322: if (right) {
1323: VecCreate(PetscObjectComm((PetscObject)mat),right);
1324: VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
1325: VecSetBlockSize(*right,cbs);
1326: VecSetType(*right,VECSEQCUDA);
1327: PetscLayoutReference(mat->cmap,&(*right)->map);
1328: }
1329: if (left) {
1330: VecCreate(PetscObjectComm((PetscObject)mat),left);
1331: VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
1332: VecSetBlockSize(*left,rbs);
1333: VecSetType(*left,VECSEQCUDA);
1334: PetscLayoutReference(mat->rmap,&(*left)->map);
1335: }
1336: return(0);
1337: }
1339: struct VecCUDAPlusEquals
1340: {
1341: template <typename Tuple>
1342: __host__ __device__
1343: void operator()(Tuple t)
1344: {
1345: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1346: }
1347: };
1349: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1350: {
1351: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1352: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1353: Mat_SeqAIJCUSPARSEMultStruct *matstruct; /* Do not initialize yet, because GPU data may not be available yet */
1354: const PetscScalar *xarray;
1355: PetscScalar *yarray;
1356: PetscErrorCode ierr;
1357: cusparseStatus_t stat;
1360: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1361: MatSeqAIJCUSPARSECopyToGPU(A);
1362: VecCUDAGetArrayRead(xx,&xarray);
1363: VecSet(yy,0);
1364: VecCUDAGetArrayWrite(yy,&yarray);
1365: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1366: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1367: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1368: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1369: mat->num_rows, mat->num_cols, mat->num_entries,
1370: matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
1371: mat->column_indices->data().get(), xarray, matstruct->beta,
1372: yarray);CHKERRCUDA(stat);
1373: } else {
1374: #if CUDA_VERSION>=4020
1375: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1376: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1377: matstruct->alpha, matstruct->descr, hybMat,
1378: xarray, matstruct->beta,
1379: yarray);CHKERRCUDA(stat);
1380: #endif
1381: }
1382: VecCUDARestoreArrayRead(xx,&xarray);
1383: VecCUDARestoreArrayWrite(yy,&yarray);
1384: if (!cusparsestruct->stream) {
1385: WaitForGPU();CHKERRCUDA(ierr);
1386: }
1387: PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1388: return(0);
1389: }
1391: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1392: {
1393: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1394: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1395: Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1396: const PetscScalar *xarray;
1397: PetscScalar *yarray;
1398: PetscErrorCode ierr;
1399: cusparseStatus_t stat;
1402: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1403: MatSeqAIJCUSPARSECopyToGPU(A);
1404: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1405: if (!matstructT) {
1406: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1407: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1408: }
1409: VecCUDAGetArrayRead(xx,&xarray);
1410: VecSet(yy,0);
1411: VecCUDAGetArrayWrite(yy,&yarray);
1413: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1414: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1415: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1416: mat->num_rows, mat->num_cols,
1417: mat->num_entries, matstructT->alpha, matstructT->descr,
1418: mat->values->data().get(), mat->row_offsets->data().get(),
1419: mat->column_indices->data().get(), xarray, matstructT->beta,
1420: yarray);CHKERRCUDA(stat);
1421: } else {
1422: #if CUDA_VERSION>=4020
1423: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1424: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1425: matstructT->alpha, matstructT->descr, hybMat,
1426: xarray, matstructT->beta,
1427: yarray);CHKERRCUDA(stat);
1428: #endif
1429: }
1430: VecCUDARestoreArrayRead(xx,&xarray);
1431: VecCUDARestoreArrayWrite(yy,&yarray);
1432: if (!cusparsestruct->stream) {
1433: WaitForGPU();CHKERRCUDA(ierr);
1434: }
1435: PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1436: return(0);
1437: }
1440: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1441: {
1442: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1443: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1444: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1445: thrust::device_ptr<PetscScalar> zptr;
1446: const PetscScalar *xarray;
1447: PetscScalar *zarray;
1448: PetscErrorCode ierr;
1449: cusparseStatus_t stat;
1452: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1453: MatSeqAIJCUSPARSECopyToGPU(A);
1454: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1455: try {
1456: VecCopy_SeqCUDA(yy,zz);
1457: VecCUDAGetArrayRead(xx,&xarray);
1458: VecCUDAGetArrayReadWrite(zz,&zarray);
1459: zptr = thrust::device_pointer_cast(zarray);
1461: /* multiply add */
1462: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1463: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1464: /* here we need to be careful to set the number of rows in the multiply to the
1465: number of compressed rows in the matrix ... which is equivalent to the
1466: size of the workVector */
1467: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1468: mat->num_rows, mat->num_cols,
1469: mat->num_entries, matstruct->alpha, matstruct->descr,
1470: mat->values->data().get(), mat->row_offsets->data().get(),
1471: mat->column_indices->data().get(), xarray, matstruct->beta,
1472: cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1473: } else {
1474: #if CUDA_VERSION>=4020
1475: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1476: if (cusparsestruct->workVector->size()) {
1477: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1478: matstruct->alpha, matstruct->descr, hybMat,
1479: xarray, matstruct->beta,
1480: cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1481: }
1482: #endif
1483: }
1485: /* scatter the data from the temporary into the full vector with a += operation */
1486: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1487: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1488: VecCUDAPlusEquals());
1489: VecCUDARestoreArrayRead(xx,&xarray);
1490: VecCUDARestoreArrayReadWrite(zz,&zarray);
1492: } catch(char *ex) {
1493: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1494: }
1495: WaitForGPU();CHKERRCUDA(ierr);
1496: PetscLogFlops(2.0*a->nz);
1497: return(0);
1498: }
1500: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1501: {
1502: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1503: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1504: Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1505: thrust::device_ptr<PetscScalar> zptr;
1506: const PetscScalar *xarray;
1507: PetscScalar *zarray;
1508: PetscErrorCode ierr;
1509: cusparseStatus_t stat;
1512: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1513: MatSeqAIJCUSPARSECopyToGPU(A);
1514: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1515: if (!matstructT) {
1516: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1517: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1518: }
1520: try {
1521: VecCopy_SeqCUDA(yy,zz);
1522: VecCUDAGetArrayRead(xx,&xarray);
1523: VecCUDAGetArrayReadWrite(zz,&zarray);
1524: zptr = thrust::device_pointer_cast(zarray);
1526: /* multiply add with matrix transpose */
1527: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1528: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1529: /* here we need to be careful to set the number of rows in the multiply to the
1530: number of compressed rows in the matrix ... which is equivalent to the
1531: size of the workVector */
1532: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1533: mat->num_rows, mat->num_cols,
1534: mat->num_entries, matstructT->alpha, matstructT->descr,
1535: mat->values->data().get(), mat->row_offsets->data().get(),
1536: mat->column_indices->data().get(), xarray, matstructT->beta,
1537: cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1538: } else {
1539: #if CUDA_VERSION>=4020
1540: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1541: if (cusparsestruct->workVector->size()) {
1542: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1543: matstructT->alpha, matstructT->descr, hybMat,
1544: xarray, matstructT->beta,
1545: cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1546: }
1547: #endif
1548: }
1550: /* scatter the data from the temporary into the full vector with a += operation */
1551: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
1552: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
1553: VecCUDAPlusEquals());
1555: VecCUDARestoreArrayRead(xx,&xarray);
1556: VecCUDARestoreArrayReadWrite(zz,&zarray);
1558: } catch(char *ex) {
1559: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1560: }
1561: WaitForGPU();CHKERRCUDA(ierr);
1562: PetscLogFlops(2.0*a->nz);
1563: return(0);
1564: }
1566: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1567: {
1571: MatAssemblyEnd_SeqAIJ(A,mode);
1572: if (A->factortype==MAT_FACTOR_NONE) {
1573: MatSeqAIJCUSPARSECopyToGPU(A);
1574: }
1575: if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1576: A->ops->mult = MatMult_SeqAIJCUSPARSE;
1577: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1578: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1579: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1580: return(0);
1581: }
1583: /* --------------------------------------------------------------------------------*/
1584: /*@
1585: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1586: (the default parallel PETSc format). This matrix will ultimately pushed down
1587: to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1588: assembly performance the user should preallocate the matrix storage by setting
1589: the parameter nz (or the array nnz). By setting these parameters accurately,
1590: performance during matrix assembly can be increased by more than a factor of 50.
1592: Collective on MPI_Comm
1594: Input Parameters:
1595: + comm - MPI communicator, set to PETSC_COMM_SELF
1596: . m - number of rows
1597: . n - number of columns
1598: . nz - number of nonzeros per row (same for all rows)
1599: - nnz - array containing the number of nonzeros in the various rows
1600: (possibly different for each row) or NULL
1602: Output Parameter:
1603: . A - the matrix
1605: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1606: MatXXXXSetPreallocation() paradgm instead of this routine directly.
1607: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1609: Notes:
1610: If nnz is given then nz is ignored
1612: The AIJ format (also called the Yale sparse matrix format or
1613: compressed row storage), is fully compatible with standard Fortran 77
1614: storage. That is, the stored row and column indices can begin at
1615: either one (as in Fortran) or zero. See the users' manual for details.
1617: Specify the preallocated storage with either nz or nnz (not both).
1618: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1619: allocation. For large problems you MUST preallocate memory or you
1620: will get TERRIBLE performance, see the users' manual chapter on matrices.
1622: By default, this format uses inodes (identical nodes) when possible, to
1623: improve numerical efficiency of matrix-vector products and solves. We
1624: search for consecutive rows with the same nonzero structure, thereby
1625: reusing matrix information to achieve increased efficiency.
1627: Level: intermediate
1629: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1630: @*/
1631: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1632: {
1636: MatCreate(comm,A);
1637: MatSetSizes(*A,m,n,m,n);
1638: MatSetType(*A,MATSEQAIJCUSPARSE);
1639: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1640: return(0);
1641: }
1643: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1644: {
1645: PetscErrorCode ierr;
1648: if (A->factortype==MAT_FACTOR_NONE) {
1649: if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
1650: MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1651: }
1652: } else {
1653: MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1654: }
1655: MatDestroy_SeqAIJ(A);
1656: return(0);
1657: }
1659: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1660: {
1662: Mat C;
1663: cusparseStatus_t stat;
1664: cusparseHandle_t handle=0;
1667: MatDuplicate_SeqAIJ(A,cpvalues,B);
1668: C = *B;
1669: /* inject CUSPARSE-specific stuff */
1670: if (C->factortype==MAT_FACTOR_NONE) {
1671: /* you cannot check the inode.use flag here since the matrix was just created.
1672: now build a GPU matrix data structure */
1673: C->spptr = new Mat_SeqAIJCUSPARSE;
1674: ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0;
1675: ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
1676: ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0;
1677: ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR;
1678: ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0;
1679: ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = 0;
1680: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1681: ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle;
1682: ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0;
1683: ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
1684: } else {
1685: /* NEXT, set the pointers to the triangular factors */
1686: C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1687: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0;
1688: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0;
1689: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1690: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1691: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0;
1692: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0;
1693: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0;
1694: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0;
1695: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1696: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle;
1697: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0;
1698: }
1700: C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
1701: C->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
1702: C->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE;
1703: C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
1704: C->ops->mult = MatMult_SeqAIJCUSPARSE;
1705: C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1706: C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1707: C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1708: C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
1710: PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);
1712: C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
1714: PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1715: return(0);
1716: }
1718: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1719: {
1721: cusparseStatus_t stat;
1722: cusparseHandle_t handle=0;
1725: MatCreate_SeqAIJ(B);
1726: if (B->factortype==MAT_FACTOR_NONE) {
1727: /* you cannot check the inode.use flag here since the matrix was just created.
1728: now build a GPU matrix data structure */
1729: B->spptr = new Mat_SeqAIJCUSPARSE;
1730: ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0;
1731: ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1732: ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0;
1733: ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR;
1734: ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0;
1735: ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0;
1736: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1737: ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle;
1738: ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0;
1739: ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
1740: } else {
1741: /* NEXT, set the pointers to the triangular factors */
1742: B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1743: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
1744: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
1745: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1746: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1747: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0;
1748: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0;
1749: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0;
1750: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0;
1751: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1752: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle;
1753: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0;
1754: }
1756: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
1757: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
1758: B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE;
1759: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
1760: B->ops->mult = MatMult_SeqAIJCUSPARSE;
1761: B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1762: B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1763: B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1764: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
1766: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
1768: B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
1770: PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1771: return(0);
1772: }
1774: /*MC
1775: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1777: A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1778: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1779: All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1781: Options Database Keys:
1782: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1783: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1784: . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1786: Level: beginner
1788: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1789: M*/
1791: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1794: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1795: {
1799: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
1800: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
1801: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
1802: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
1803: return(0);
1804: }
1807: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1808: {
1809: cusparseStatus_t stat;
1810: cusparseHandle_t handle;
1813: if (*cusparsestruct) {
1814: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1815: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1816: delete (*cusparsestruct)->workVector;
1817: if (handle = (*cusparsestruct)->handle) {
1818: stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1819: }
1820: delete *cusparsestruct;
1821: *cusparsestruct = 0;
1822: }
1823: return(0);
1824: }
1826: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1827: {
1829: if (*mat) {
1830: delete (*mat)->values;
1831: delete (*mat)->column_indices;
1832: delete (*mat)->row_offsets;
1833: delete *mat;
1834: *mat = 0;
1835: }
1836: return(0);
1837: }
1839: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1840: {
1841: cusparseStatus_t stat;
1842: PetscErrorCode ierr;
1845: if (*trifactor) {
1846: if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1847: if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1848: CsrMatrix_Destroy(&(*trifactor)->csrMat);
1849: delete *trifactor;
1850: *trifactor = 0;
1851: }
1852: return(0);
1853: }
1855: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1856: {
1857: CsrMatrix *mat;
1858: cusparseStatus_t stat;
1859: cudaError_t err;
1862: if (*matstruct) {
1863: if ((*matstruct)->mat) {
1864: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1865: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1866: stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1867: } else {
1868: mat = (CsrMatrix*)(*matstruct)->mat;
1869: CsrMatrix_Destroy(&mat);
1870: }
1871: }
1872: if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1873: delete (*matstruct)->cprowIndices;
1874: if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1875: if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
1876: delete *matstruct;
1877: *matstruct = 0;
1878: }
1879: return(0);
1880: }
1882: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1883: {
1884: cusparseHandle_t handle;
1885: cusparseStatus_t stat;
1888: if (*trifactors) {
1889: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1890: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1891: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1892: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1893: delete (*trifactors)->rpermIndices;
1894: delete (*trifactors)->cpermIndices;
1895: delete (*trifactors)->workVector;
1896: if (handle = (*trifactors)->handle) {
1897: stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1898: }
1899: delete *trifactors;
1900: *trifactors = 0;
1901: }
1902: return(0);
1903: }