Actual source code: aijcusparse.cu
petsc-3.11.4 2019-09-28
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library.
4: */
5: #define PETSC_SKIP_SPINLOCK
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/vecimpl.h>
12: #undef VecType
13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
17: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
21: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
25: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
35: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
41: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
42: {
43: cusparseStatus_t stat;
44: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
47: cusparsestruct->stream = stream;
48: stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
49: return(0);
50: }
52: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
53: {
54: cusparseStatus_t stat;
55: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
58: if (cusparsestruct->handle != handle) {
59: if (cusparsestruct->handle) {
60: stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
61: }
62: cusparsestruct->handle = handle;
63: }
64: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
65: return(0);
66: }
68: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
69: {
70: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
72: if (cusparsestruct->handle)
73: cusparsestruct->handle = 0;
74: return(0);
75: }
77: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
78: {
80: *type = MATSOLVERCUSPARSE;
81: return(0);
82: }
84: /*MC
85: MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
86: on a single GPU, of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
87: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
88: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
89: CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90: algorithms are not recommended. This class does NOT support direct solver operations.
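   Example usage (a minimal sketch, assuming an ILU-preconditioned KSP solve; the options shown are illustrative):
.vb
   ./myprog -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
.ve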
92: Level: beginner
94: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95: M*/
97: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
98: {
100: PetscInt n = A->rmap->n;
103: MatCreate(PetscObjectComm((PetscObject)A),B);
104: (*B)->factortype = ftype;
105: MatSetSizes(*B,n,n,n,n);
106: MatSetType(*B,MATSEQAIJCUSPARSE);
108: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
109: MatSetBlockSizesFromMats(*B,A,A);
110: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
111: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
115: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
117: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
118: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
119: return(0);
120: }
122: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
123: {
124: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
127: #if CUDA_VERSION>=4020
128: switch (op) {
129: case MAT_CUSPARSE_MULT:
130: cusparsestruct->format = format;
131: break;
132: case MAT_CUSPARSE_ALL:
133: cusparsestruct->format = format;
134: break;
135: default:
136: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
137: }
138: #else
139: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
140: #endif
141: return(0);
142: }
144: /*@
145: MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146: operation. Only the MatMult operation can use different GPU storage formats
147: for MPIAIJCUSPARSE matrices.
148: Not Collective
150: Input Parameters:
151: + A - Matrix of type SEQAIJCUSPARSE
152: . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153: - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
157: Level: intermediate
159: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160: @*/
161: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
162: {
167: PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
168: return(0);
169: }
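/* A minimal usage sketch (assuming A is an assembled MATSEQAIJCUSPARSE matrix and the
   usual ierr/CHKERRQ error handling): select HYB storage for SpMV only, leaving the
   triangular-solve format at the default CSR:

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_HYB);CHKERRQ(ierr);

   The equivalent runtime option is -mat_cusparse_mult_storage_format hyb. */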
171: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
172: {
173: PetscErrorCode ierr;
174: MatCUSPARSEStorageFormat format;
175: PetscBool flg;
176: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
179: PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
180: if (A->factortype==MAT_FACTOR_NONE) {
181: PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
182: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
183: if (flg) {
184: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
185: }
186: }
187: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
188: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
189: if (flg) {
190: MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
191: }
192: PetscOptionsTail();
193: return(0);
195: }
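/* The two options registered above can be set at runtime, for example:
     -mat_cusparse_mult_storage_format ell   (SpMV only)
     -mat_cusparse_storage_format hyb        (SpMV and TriSolve)
   Values are matched against the strings in MatCUSPARSEStorageFormats: csr, ell, hyb. */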
197: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
198: {
202: MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
203: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
204: return(0);
205: }
207: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
208: {
212: MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
213: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
214: return(0);
215: }
217: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
218: {
222: MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
223: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
224: return(0);
225: }
227: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
228: {
232: MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
233: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
234: return(0);
235: }
237: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
238: {
239: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
240: PetscInt n = A->rmap->n;
241: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
242: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
243: cusparseStatus_t stat;
244: const PetscInt *ai = a->i,*aj = a->j,*vi;
245: const MatScalar *aa = a->a,*v;
246: PetscInt *AiLo, *AjLo;
247: PetscScalar *AALo;
248: PetscInt i,nz, nzLower, offset, rowOffset;
249: PetscErrorCode ierr;
252: if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
253: try {
254: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
255: nzLower=n+ai[n]-ai[1];
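      /* For illustration: with n=3 and strictly-lower row pointers ai = {0,0,1,3}
         (row 0 is necessarily empty), nzLower = 3 + ai[3] - ai[1] = 3 + 3 - 0 = 6,
         i.e. three unit diagonal entries plus three strictly lower entries. */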
257: /* Allocate Space for the lower triangular matrix */
258: ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
259: ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
260: ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);
262: /* Fill the lower triangular matrix */
263: AiLo[0] = (PetscInt) 0;
264: AiLo[n] = nzLower;
265: AjLo[0] = (PetscInt) 0;
266: AALo[0] = (MatScalar) 1.0;
267: v = aa;
268: vi = aj;
269: offset = 1;
270: rowOffset= 1;
271: for (i=1; i<n; i++) {
272: nz = ai[i+1] - ai[i];
273: /* additional 1 for the term on the diagonal */
274: AiLo[i] = rowOffset;
275: rowOffset += nz+1;
277: ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
278: ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
280: offset += nz;
281: AjLo[offset] = (PetscInt) i;
282: AALo[offset] = (MatScalar) 1.0;
283: offset += 1;
285: v += nz;
286: vi += nz;
287: }
289: /* allocate space for the triangular factor information */
290: loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
292: /* Create the matrix description */
293: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
294: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
295: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
296: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
297: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
299: /* Create the solve analysis information */
300: stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
302: /* set the operation */
303: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
305: /* set the matrix */
306: loTriFactor->csrMat = new CsrMatrix;
307: loTriFactor->csrMat->num_rows = n;
308: loTriFactor->csrMat->num_cols = n;
309: loTriFactor->csrMat->num_entries = nzLower;
311: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
312: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
314: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
315: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
317: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
318: loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
320: /* perform the solve analysis */
321: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
322: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
323: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
324: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
326: /* assign the pointer. Is this really necessary? */
327: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
329: ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
330: ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
331: ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
332: } catch(char *ex) {
333: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
334: }
335: }
336: return(0);
337: }
339: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
340: {
341: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
342: PetscInt n = A->rmap->n;
343: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
344: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
345: cusparseStatus_t stat;
346: const PetscInt *aj = a->j,*adiag = a->diag,*vi;
347: const MatScalar *aa = a->a,*v;
348: PetscInt *AiUp, *AjUp;
349: PetscScalar *AAUp;
350: PetscInt i,nz, nzUpper, offset;
351: PetscErrorCode ierr;
354: if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
355: try {
356: /* next, figure out the number of nonzeros in the upper triangular matrix. */
357: nzUpper = adiag[0]-adiag[n];
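      /* Note: adiag[] points into the U part of the factor, whose rows are stored
         in reverse order, so adiag[0]-adiag[n] counts the upper triangular entries,
         diagonal included. */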
359: /* Allocate Space for the upper triangular matrix */
360: ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
361: ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
362: ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
364: /* Fill the upper triangular matrix */
365: AiUp[0]=(PetscInt) 0;
366: AiUp[n]=nzUpper;
367: offset = nzUpper;
368: for (i=n-1; i>=0; i--) {
369: v = aa + adiag[i+1] + 1;
370: vi = aj + adiag[i+1] + 1;
372: /* number of elements NOT on the diagonal */
373: nz = adiag[i] - adiag[i+1]-1;
375: /* decrement the offset */
376: offset -= (nz+1);
378: /* first, set the diagonal elements */
379: AjUp[offset] = (PetscInt) i;
380: AAUp[offset] = (MatScalar)1./v[nz];
381: AiUp[i] = AiUp[i+1] - (nz+1);
383: ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
384: ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
385: }
387: /* allocate space for the triangular factor information */
388: upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
390: /* Create the matrix description */
391: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
392: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
393: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
394: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
395: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
397: /* Create the solve analysis information */
398: stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
400: /* set the operation */
401: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
403: /* set the matrix */
404: upTriFactor->csrMat = new CsrMatrix;
405: upTriFactor->csrMat->num_rows = n;
406: upTriFactor->csrMat->num_cols = n;
407: upTriFactor->csrMat->num_entries = nzUpper;
409: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
410: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
412: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
413: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
415: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
416: upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
418: /* perform the solve analysis */
419: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
420: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
421: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
422: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
424: /* assign the pointer. Is this really necessary? */
425: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
427: ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
428: ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
429: ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
430: } catch(char *ex) {
431: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
432: }
433: }
434: return(0);
435: }
437: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
438: {
439: PetscErrorCode ierr;
440: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
441: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
442: IS isrow = a->row,iscol = a->icol;
443: PetscBool row_identity,col_identity;
444: const PetscInt *r,*c;
445: PetscInt n = A->rmap->n;
448: MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
449: MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);
451: cusparseTriFactors->workVector = new THRUSTARRAY(n);
452: cusparseTriFactors->nnz=a->nz;
454: A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
455: /* lower triangular indices */
456: ISGetIndices(isrow,&r);
457: ISIdentity(isrow,&row_identity);
458: if (!row_identity) {
459: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
460: cusparseTriFactors->rpermIndices->assign(r, r+n);
461: }
462: ISRestoreIndices(isrow,&r);
464: /* upper triangular indices */
465: ISGetIndices(iscol,&c);
466: ISIdentity(iscol,&col_identity);
467: if (!col_identity) {
468: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
469: cusparseTriFactors->cpermIndices->assign(c, c+n);
470: }
471: ISRestoreIndices(iscol,&c);
472: return(0);
473: }
475: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
476: {
477: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
478: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
479: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
480: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
481: cusparseStatus_t stat;
482: PetscErrorCode ierr;
483: PetscInt *AiUp, *AjUp;
484: PetscScalar *AAUp;
485: PetscScalar *AALo;
486: PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
487: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
488: const PetscInt *ai = b->i,*aj = b->j,*vj;
489: const MatScalar *aa = b->a,*v;
492: if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
493: try {
494: /* Allocate Space for the upper triangular matrix */
495: ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
496: ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
497: ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
498: ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
500: /* Fill the upper triangular matrix */
501: AiUp[0]=(PetscInt) 0;
502: AiUp[n]=nzUpper;
503: offset = 0;
504: for (i=0; i<n; i++) {
505: /* set the pointers */
506: v = aa + ai[i];
507: vj = aj + ai[i];
508: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
510: /* first, set the diagonal elements */
511: AjUp[offset] = (PetscInt) i;
512: AAUp[offset] = (MatScalar)1.0/v[nz];
513: AiUp[i] = offset;
514: AALo[offset] = (MatScalar)1.0/v[nz];
516: offset+=1;
517: if (nz>0) {
518: ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
519: ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
520: for (j=offset; j<offset+nz; j++) {
521: AAUp[j] = -AAUp[j];
522: AALo[j] = AAUp[j]/v[nz];
523: }
524: offset+=nz;
525: }
526: }
528: /* allocate space for the triangular factor information */
529: upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
531: /* Create the matrix description */
532: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
533: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
534: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
535: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
536: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
538: /* Create the solve analysis information */
539: stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
541: /* set the operation */
542: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
544: /* set the matrix */
545: upTriFactor->csrMat = new CsrMatrix;
546: upTriFactor->csrMat->num_rows = A->rmap->n;
547: upTriFactor->csrMat->num_cols = A->cmap->n;
548: upTriFactor->csrMat->num_entries = a->nz;
550: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
551: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
553: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
554: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
556: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
557: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
559: /* perform the solve analysis */
560: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
561: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
562: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
563: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
565: /* assign the pointer. Is this really necessary? */
566: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
568: /* allocate space for the triangular factor information */
569: loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
571: /* Create the matrix description */
572: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
573: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
574: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
575: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
576: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
578: /* Create the solve analysis information */
579: stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
581: /* set the operation */
582: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
584: /* set the matrix */
585: loTriFactor->csrMat = new CsrMatrix;
586: loTriFactor->csrMat->num_rows = A->rmap->n;
587: loTriFactor->csrMat->num_cols = A->cmap->n;
588: loTriFactor->csrMat->num_entries = a->nz;
590: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
591: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
593: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
594: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
596: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
597: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
599: /* perform the solve analysis */
600: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
601: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
602: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
603: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
605: /* assign the pointer. Is this really necessary? */
606: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
608: A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
609: ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
610: ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
611: ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
612: ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
613: } catch(char *ex) {
614: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
615: }
616: }
617: return(0);
618: }
620: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
621: {
622: PetscErrorCode ierr;
623: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
624: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
625: IS ip = a->row;
626: const PetscInt *rip;
627: PetscBool perm_identity;
628: PetscInt n = A->rmap->n;
631: MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
632: cusparseTriFactors->workVector = new THRUSTARRAY(n);
633: cusparseTriFactors->nnz=(a->nz-n)*2 + n;
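  /* Flop accounting for the solves: the strictly upper entries are used by both
     triangular sweeps, hence counted twice; the n diagonal entries are counted once. */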
635: /* lower triangular indices */
636: ISGetIndices(ip,&rip);
637: ISIdentity(ip,&perm_identity);
638: if (!perm_identity) {
639: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
640: cusparseTriFactors->rpermIndices->assign(rip, rip+n);
641: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
642: cusparseTriFactors->cpermIndices->assign(rip, rip+n);
643: }
644: ISRestoreIndices(ip,&rip);
645: return(0);
646: }
648: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
649: {
650: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
651: IS isrow = b->row,iscol = b->col;
652: PetscBool row_identity,col_identity;
656: MatLUFactorNumeric_SeqAIJ(B,A,info);
657: /* determine which version of MatSolve needs to be used. */
658: ISIdentity(isrow,&row_identity);
659: ISIdentity(iscol,&col_identity);
660: if (row_identity && col_identity) {
661: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
662: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
663: } else {
664: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
665: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
666: }
668: /* get the triangular factors */
669: MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
670: return(0);
671: }
673: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
674: {
675: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
676: IS ip = b->row;
677: PetscBool perm_identity;
681: MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
683: /* determine which version of MatSolve needs to be used. */
684: ISIdentity(ip,&perm_identity);
685: if (perm_identity) {
686: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
687: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
688: } else {
689: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
690: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
691: }
693: /* get the triangular factors */
694: MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
695: return(0);
696: }
698: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
699: {
700: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
701: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
702: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
703: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
704: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
705: cusparseStatus_t stat;
706: cusparseIndexBase_t indexBase;
707: cusparseMatrixType_t matrixType;
708: cusparseFillMode_t fillMode;
709: cusparseDiagType_t diagType;
713: /*********************************************/
714: /* Now the Transpose of the Lower Tri Factor */
715: /*********************************************/
717: /* allocate space for the transpose of the lower triangular factor */
718: loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
720: /* set the matrix descriptors of the lower triangular factor */
721: matrixType = cusparseGetMatType(loTriFactor->descr);
722: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
723: fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
724: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
725: diagType = cusparseGetMatDiagType(loTriFactor->descr);
727: /* Create the matrix description */
728: stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
729: stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
730: stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
731: stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
732: stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);
734: /* Create the solve analysis information */
735: stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);
737: /* set the operation */
738: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
740: /* allocate GPU space for the CSC of the lower triangular factor */
741: loTriFactorT->csrMat = new CsrMatrix;
742: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
743: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
744: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
745: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
746: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
747: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
749: /* compute the transpose of the lower triangular factor, i.e. the CSC */
750: stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
751: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
752: loTriFactor->csrMat->values->data().get(),
753: loTriFactor->csrMat->row_offsets->data().get(),
754: loTriFactor->csrMat->column_indices->data().get(),
755: loTriFactorT->csrMat->values->data().get(),
756: loTriFactorT->csrMat->column_indices->data().get(),
757: loTriFactorT->csrMat->row_offsets->data().get(),
758: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
760: /* perform the solve analysis on the transposed matrix */
761: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
762: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
763: loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
764: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
765: loTriFactorT->solveInfo);CHKERRCUDA(stat);
767: /* assign the pointer. Is this really necessary? */
768: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
770: /*********************************************/
771: /* Now the Transpose of the Upper Tri Factor */
772: /*********************************************/
774: /* allocate space for the transpose of the upper triangular factor */
775: upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
777: /* set the matrix descriptors of the upper triangular factor */
778: matrixType = cusparseGetMatType(upTriFactor->descr);
779: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
780: fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
781: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
782: diagType = cusparseGetMatDiagType(upTriFactor->descr);
784: /* Create the matrix description */
785: stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
786: stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
787: stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
788: stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
789: stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);
791: /* Create the solve analysis information */
792: stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);
794: /* set the operation */
795: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
797: /* allocate GPU space for the CSC of the upper triangular factor */
798: upTriFactorT->csrMat = new CsrMatrix;
799: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
800: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
801: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
802: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
803: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
804: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
806: /* compute the transpose of the upper triangular factor, i.e. the CSC */
807: stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
808: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
809: upTriFactor->csrMat->values->data().get(),
810: upTriFactor->csrMat->row_offsets->data().get(),
811: upTriFactor->csrMat->column_indices->data().get(),
812: upTriFactorT->csrMat->values->data().get(),
813: upTriFactorT->csrMat->column_indices->data().get(),
814: upTriFactorT->csrMat->row_offsets->data().get(),
815: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
817: /* perform the solve analysis on the transposed matrix */
818: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
819: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
820: upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
821: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
822: upTriFactorT->solveInfo);CHKERRCUDA(stat);
824: /* assign the pointer. Is this really necessary? */
825: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
826: return(0);
827: }
829: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
830: {
831: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
832: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
833: Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
834: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
835: cusparseStatus_t stat;
836: cusparseIndexBase_t indexBase;
837: cudaError_t err;
841: /* allocate space for the matrix transpose information */
842: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
843: stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
844: indexBase = cusparseGetMatIndexBase(matstruct->descr);
845: stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
846: stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
848: /* set alpha and beta */
849: err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
850: err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
851: err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
852: err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
853: err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
854: err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
855: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
857: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
858: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
859: CsrMatrix *matrixT= new CsrMatrix;
860: matrixT->num_rows = A->cmap->n;
861: matrixT->num_cols = A->rmap->n;
862: matrixT->num_entries = a->nz;
863: matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
864: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
865: matrixT->values = new THRUSTARRAY(a->nz);
867: /* compute the transpose of the matrix, i.e. the CSC */
868: indexBase = cusparseGetMatIndexBase(matstruct->descr);
869: stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
870: matrix->num_cols, matrix->num_entries,
871: matrix->values->data().get(),
872: matrix->row_offsets->data().get(),
873: matrix->column_indices->data().get(),
874: matrixT->values->data().get(),
875: matrixT->column_indices->data().get(),
876: matrixT->row_offsets->data().get(),
877: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
879: /* assign the pointer */
880: matstructT->mat = matrixT;
882: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
883: #if CUDA_VERSION>=5000
884: /* First convert HYB to CSR */
885: CsrMatrix *temp= new CsrMatrix;
886: temp->num_rows = A->rmap->n;
887: temp->num_cols = A->cmap->n;
888: temp->num_entries = a->nz;
889: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
890: temp->column_indices = new THRUSTINTARRAY32(a->nz);
891: temp->values = new THRUSTARRAY(a->nz);
894: stat = cusparse_hyb2csr(cusparsestruct->handle,
895: matstruct->descr, (cusparseHybMat_t)matstruct->mat,
896: temp->values->data().get(),
897: temp->row_offsets->data().get(),
898: temp->column_indices->data().get());CHKERRCUDA(stat);
900: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
901: CsrMatrix *tempT= new CsrMatrix;
902: tempT->num_rows = A->rmap->n;
903: tempT->num_cols = A->cmap->n;
904: tempT->num_entries = a->nz;
905: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
906: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
907: tempT->values = new THRUSTARRAY(a->nz);
909: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
910: temp->num_cols, temp->num_entries,
911: temp->values->data().get(),
912: temp->row_offsets->data().get(),
913: temp->column_indices->data().get(),
914: tempT->values->data().get(),
915: tempT->column_indices->data().get(),
916: tempT->row_offsets->data().get(),
917: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
919: /* Last, convert CSC to HYB */
920: cusparseHybMat_t hybMat;
921: stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
922: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
923: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
924: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
925: matstructT->descr, tempT->values->data().get(),
926: tempT->row_offsets->data().get(),
927: tempT->column_indices->data().get(),
928: hybMat, 0, partition);CHKERRCUDA(stat);
930: /* assign the pointer */
931: matstructT->mat = hybMat;
933: /* delete temporaries */
934: if (tempT) {
935: if (tempT->values) delete (THRUSTARRAY*) tempT->values;
936: if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
937: if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
938: delete (CsrMatrix*) tempT;
939: }
940: if (temp) {
941: if (temp->values) delete (THRUSTARRAY*) temp->values;
942: if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
943: if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
944: delete (CsrMatrix*) temp;
945: }
946: #else
947: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
948: #endif
949: }
950: /* assign the compressed row indices */
951: matstructT->cprowIndices = new THRUSTINTARRAY;
952: matstructT->cprowIndices->resize(A->cmap->n);
953: thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
955: /* assign the pointer */
956: ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
957: return(0);
958: }
960: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
961: {
962: PetscInt n = xx->map->n;
963: const PetscScalar *barray;
964: PetscScalar *xarray;
965: thrust::device_ptr<const PetscScalar> bGPU;
966: thrust::device_ptr<PetscScalar> xGPU;
967: cusparseStatus_t stat;
968: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
969: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
970: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
971: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
972: PetscErrorCode ierr;
975: /* Analyze the matrix and create the transpose ... on the fly */
976: if (!loTriFactorT && !upTriFactorT) {
977: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
978: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
979: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
980: }
982: /* Get the GPU pointers */
983: VecCUDAGetArrayWrite(xx,&xarray);
984: VecCUDAGetArrayRead(bb,&barray);
985: xGPU = thrust::device_pointer_cast(xarray);
986: bGPU = thrust::device_pointer_cast(barray);
988: /* First, reorder with the row permutation */
989: thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
990: thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
991: xGPU);
993: /* Next, solve U */
994: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
995: upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
996: upTriFactorT->csrMat->values->data().get(),
997: upTriFactorT->csrMat->row_offsets->data().get(),
998: upTriFactorT->csrMat->column_indices->data().get(),
999: upTriFactorT->solveInfo,
1000: xarray, tempGPU->data().get());CHKERRCUDA(stat);
1002: /* Then, solve L */
1003: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1004: loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1005: loTriFactorT->csrMat->values->data().get(),
1006: loTriFactorT->csrMat->row_offsets->data().get(),
1007: loTriFactorT->csrMat->column_indices->data().get(),
1008: loTriFactorT->solveInfo,
1009: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1011: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1012: thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1013: thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1014: tempGPU->begin());
1016: /* Copy the temporary to the full solution. */
1017: thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1019: /* restore */
1020: VecCUDARestoreArrayRead(bb,&barray);
1021: VecCUDARestoreArrayWrite(xx,&xarray);
1022: ierr = WaitForGPU();CHKERRCUDA(ierr);
1024: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1025: return(0);
1026: }
1028: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1029: {
1030: const PetscScalar *barray;
1031: PetscScalar *xarray;
1032: cusparseStatus_t stat;
1033: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1034: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1035: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1036: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1037: PetscErrorCode ierr;
1040: /* Analyze the matrix and create the transpose ... on the fly */
1041: if (!loTriFactorT && !upTriFactorT) {
1042: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1043: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1044: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1045: }
1047: /* Get the GPU pointers */
1048: VecCUDAGetArrayWrite(xx,&xarray);
1049: VecCUDAGetArrayRead(bb,&barray);
1051: /* First, solve U */
1052: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1053: upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1054: upTriFactorT->csrMat->values->data().get(),
1055: upTriFactorT->csrMat->row_offsets->data().get(),
1056: upTriFactorT->csrMat->column_indices->data().get(),
1057: upTriFactorT->solveInfo,
1058: barray, tempGPU->data().get());CHKERRCUDA(stat);
1060: /* Then, solve L */
1061: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1062: loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1063: loTriFactorT->csrMat->values->data().get(),
1064: loTriFactorT->csrMat->row_offsets->data().get(),
1065: loTriFactorT->csrMat->column_indices->data().get(),
1066: loTriFactorT->solveInfo,
1067: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1069: /* restore */
1070: VecCUDARestoreArrayRead(bb,&barray);
1071: VecCUDARestoreArrayWrite(xx,&xarray);
1072: ierr = WaitForGPU();CHKERRCUDA(ierr);
1073: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1074: return(0);
1075: }
1077: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1078: {
1079: const PetscScalar *barray;
1080: PetscScalar *xarray;
1081: thrust::device_ptr<const PetscScalar> bGPU;
1082: thrust::device_ptr<PetscScalar> xGPU;
1083: cusparseStatus_t stat;
1084: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1085: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1086: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1087: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1088: PetscErrorCode ierr;
1092: /* Get the GPU pointers */
1093: VecCUDAGetArrayWrite(xx,&xarray);
1094: VecCUDAGetArrayRead(bb,&barray);
1095: xGPU = thrust::device_pointer_cast(xarray);
1096: bGPU = thrust::device_pointer_cast(barray);
1098: /* First, reorder with the row permutation */
1099: thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1100: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1101: xGPU);
1103: /* Next, solve L */
1104: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1105: loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1106: loTriFactor->csrMat->values->data().get(),
1107: loTriFactor->csrMat->row_offsets->data().get(),
1108: loTriFactor->csrMat->column_indices->data().get(),
1109: loTriFactor->solveInfo,
1110: xarray, tempGPU->data().get());CHKERRCUDA(stat);
1112: /* Then, solve U */
1113: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1114: upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1115: upTriFactor->csrMat->values->data().get(),
1116: upTriFactor->csrMat->row_offsets->data().get(),
1117: upTriFactor->csrMat->column_indices->data().get(),
1118: upTriFactor->solveInfo,
1119: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1121: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1122: thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1123: thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
1124: tempGPU->begin());
1126: /* Copy the temporary to the full solution. */
1127: thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1129: VecCUDARestoreArrayRead(bb,&barray);
1130: VecCUDARestoreArrayWrite(xx,&xarray);
1131: ierr = WaitForGPU();CHKERRCUDA(ierr);
1132: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1133: return(0);
1134: }
1136: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1137: {
1138: const PetscScalar *barray;
1139: PetscScalar *xarray;
1140: cusparseStatus_t stat;
1141: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1142: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1143: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1144: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1145: PetscErrorCode ierr;
1148: /* Get the GPU pointers */
1149: VecCUDAGetArrayWrite(xx,&xarray);
1150: VecCUDAGetArrayRead(bb,&barray);
1152: /* First, solve L */
1153: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1154: loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1155: loTriFactor->csrMat->values->data().get(),
1156: loTriFactor->csrMat->row_offsets->data().get(),
1157: loTriFactor->csrMat->column_indices->data().get(),
1158: loTriFactor->solveInfo,
1159: barray, tempGPU->data().get());CHKERRCUDA(stat);
1161: /* Next, solve U */
1162: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1163: upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1164: upTriFactor->csrMat->values->data().get(),
1165: upTriFactor->csrMat->row_offsets->data().get(),
1166: upTriFactor->csrMat->column_indices->data().get(),
1167: upTriFactor->solveInfo,
1168: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1170: VecCUDARestoreArrayRead(bb,&barray);
1171: VecCUDARestoreArrayWrite(xx,&xarray);
1172: ierr = WaitForGPU();CHKERRCUDA(ierr);
1173: PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1174: return(0);
1175: }
1177: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1178: {
1180: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1181: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1182: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1183: PetscInt m = A->rmap->n,*ii,*ridx;
1184: PetscErrorCode ierr;
1185: cusparseStatus_t stat;
1186: cudaError_t err;
1189: if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1190: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1191: if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1192: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1193: /* copy values only */
1194: matrix->values->assign(a->a, a->a+a->nz);
1195: } else {
1196: MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1197: try {
1198: cusparsestruct->nonzerorow=0;
1199: for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1201: if (a->compressedrow.use) {
1202: m = a->compressedrow.nrows;
1203: ii = a->compressedrow.i;
1204: ridx = a->compressedrow.rindex;
1205: } else {
1206: /* Forcing compressed row on the GPU */
1207: int k=0;
1208: PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1209: PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1210: ii[0]=0;
1211: for (int j = 0; j<m; j++) {
1212: if ((a->i[j+1]-a->i[j])>0) {
1213: ii[k] = a->i[j];
1214: ridx[k]= j;
1215: k++;
1216: }
1217: }
1218: ii[cusparsestruct->nonzerorow] = a->nz;
1219: m = cusparsestruct->nonzerorow;
1220: }
1222: /* allocate space for the matrix multiply (SpMV) structure */
1223: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1224: stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1225: stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1226: stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1228: err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
1229: err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1230: err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1231: err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1232: err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1233: err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1234: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1236: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1237: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1238: /* set the matrix */
1239: CsrMatrix *matrix= new CsrMatrix;
1240: matrix->num_rows = m;
1241: matrix->num_cols = A->cmap->n;
1242: matrix->num_entries = a->nz;
1243: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1244: matrix->row_offsets->assign(ii, ii + m+1);
1246: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1247: matrix->column_indices->assign(a->j, a->j+a->nz);
1249: matrix->values = new THRUSTARRAY(a->nz);
1250: matrix->values->assign(a->a, a->a+a->nz);
1252: /* assign the pointer */
1253: matstruct->mat = matrix;
1255: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1256: #if CUDA_VERSION>=4020
1257: CsrMatrix *matrix= new CsrMatrix;
1258: matrix->num_rows = m;
1259: matrix->num_cols = A->cmap->n;
1260: matrix->num_entries = a->nz;
1261: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1262: matrix->row_offsets->assign(ii, ii + m+1);
1264: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1265: matrix->column_indices->assign(a->j, a->j+a->nz);
1267: matrix->values = new THRUSTARRAY(a->nz);
1268: matrix->values->assign(a->a, a->a+a->nz);
1270: cusparseHybMat_t hybMat;
1271: stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1272: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1273: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1274: stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1275: matstruct->descr, matrix->values->data().get(),
1276: matrix->row_offsets->data().get(),
1277: matrix->column_indices->data().get(),
1278: hybMat, 0, partition);CHKERRCUDA(stat);
1279: /* assign the pointer */
1280: matstruct->mat = hybMat;
1282: if (matrix) {
1283: if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1284: if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1285: if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1286: delete (CsrMatrix*)matrix;
1287: }
1288: #endif
1289: }
1291: /* assign the compressed row indices */
1292: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1293: matstruct->cprowIndices->assign(ridx,ridx+m);
1295: /* assign the pointer */
1296: cusparsestruct->mat = matstruct;
1298: if (!a->compressedrow.use) {
1299: PetscFree(ii);
1300: PetscFree(ridx);
1301: }
1302: cusparsestruct->workVector = new THRUSTARRAY(m);
1303: } catch(char *ex) {
1304: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1305: }
1306: cusparsestruct->nonzerostate = A->nonzerostate;
1307: }
1308: ierr = WaitForGPU();CHKERRCUDA(ierr);
1309: A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1310: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1311: }
1312: return(0);
1313: }
1315: struct VecCUDAPlusEquals
1316: {
1317: template <typename Tuple>
1318: __host__ __device__
1319: void operator()(Tuple t)
1320: {
1321: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1322: }
1323: };
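/* A sketch of how this functor is used (names are illustrative): zip a source range
   with a destination range and accumulate elementwise, i.e. dst[i] += src[i]:

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(src->begin(), dst)),
                      thrust::make_zip_iterator(thrust::make_tuple(src->end(), dst+n)),
                      VecCUDAPlusEquals());
*/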
1325: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1326: {
1330: MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);
1331: return(0);
1332: }
1334: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1335: {
1336: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1337: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1338: Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1339: const PetscScalar *xarray;
1340: PetscScalar *yarray;
1341: PetscErrorCode ierr;
1342: cusparseStatus_t stat;
1345: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1346: MatSeqAIJCUSPARSECopyToGPU(A);
1347: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1348: if (!matstructT) {
1349: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1350: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1351: }
1352: VecCUDAGetArrayRead(xx,&xarray);
1353: VecSet(yy,0);
1354: VecCUDAGetArrayWrite(yy,&yarray);
1356: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1357: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1358: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1359: mat->num_rows, mat->num_cols,
1360: mat->num_entries, matstructT->alpha, matstructT->descr,
1361: mat->values->data().get(), mat->row_offsets->data().get(),
1362: mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1363: yarray);CHKERRCUDA(stat);
1364: } else {
1365: #if CUDA_VERSION>=4020
1366: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1367: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1368: matstructT->alpha, matstructT->descr, hybMat,
1369: xarray, matstructT->beta_zero,
1370: yarray);CHKERRCUDA(stat);
1371: #endif
1372: }
1373: VecCUDARestoreArrayRead(xx,&xarray);
1374: VecCUDARestoreArrayWrite(yy,&yarray);
1375: if (!cusparsestruct->stream) {
1376: WaitForGPU();CHKERRCUDA(ierr);
1377: }
1378: PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1379: return(0);
1380: }
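/*
   Editorial note (a restatement of the design above, not original text): the transpose product
   is computed by applying an explicitly stored transpose of A (matstructT) with
   CUSPARSE_OPERATION_NON_TRANSPOSE, i.e. y = (A^T)*x, rather than asking CUSPARSE to transpose
   on the fly; the transposed copy is generated once by MatSeqAIJCUSPARSEGenerateTransposeForMult()
   and cached in cusparsestruct->matTranspose for reuse.
*/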
1383: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1384: {
1385: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1386: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1387: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1388: const PetscScalar *xarray;
1389: PetscScalar *zarray,*dptr,*beta;
1390: PetscErrorCode ierr;
1391: cusparseStatus_t stat;
1394: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1395: MatSeqAIJCUSPARSECopyToGPU(A);
1396: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1397: try {
1398: VecCUDAGetArrayRead(xx,&xarray);
1399: VecCUDAGetArrayReadWrite(zz,&zarray);
1400: dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
1401: beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;
1403: /* csr_spmv is multiply add */
1404: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1405: /* here we need to be careful to set the number of rows in the multiply to the
1406: number of compressed rows in the matrix ... which is equivalent to the
1407: size of the workVector */
1408: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1409: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1410: mat->num_rows, mat->num_cols,
1411: mat->num_entries, matstruct->alpha, matstruct->descr,
1412: mat->values->data().get(), mat->row_offsets->data().get(),
1413: mat->column_indices->data().get(), xarray, beta,
1414: dptr);CHKERRCUDA(stat);
1415: } else {
1416: #if CUDA_VERSION>=4020
1417: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1418: if (cusparsestruct->workVector->size()) {
1419: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1420: matstruct->alpha, matstruct->descr, hybMat,
1421: xarray, beta,
1422: dptr);CHKERRCUDA(stat);
1423: }
1424: #endif
1425: }
1427: if (yy) {
1428: if (dptr != zarray) {
1429: VecCopy_SeqCUDA(yy,zz);
1430: } else if (zz != yy) {
1431: VecAXPY_SeqCUDA(zz,1.0,yy);
1432: }
1433: } else if (dptr != zarray) {
1434: VecSet(zz,0);
1435: }
1436: /* scatter the data from the temporary into the full vector with a += operation */
1437: if (dptr != zarray) {
1438: thrust::device_ptr<PetscScalar> zptr;
1440: zptr = thrust::device_pointer_cast(zarray);
1441: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1442: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1443: VecCUDAPlusEquals());
1444: }
1445: VecCUDARestoreArrayRead(xx,&xarray);
1446: VecCUDARestoreArrayReadWrite(zz,&zarray);
1447: } catch(char *ex) {
1448: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1449: }
1450: if (!yy) { /* MatMult */
1451: if (!cusparsestruct->stream) {
1452: WaitForGPU();CHKERRCUDA(ierr);
1453: }
1454: }
1455: PetscLogFlops(2.0*a->nz);
1456: return(0);
1457: }
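/*
   Editorial note on the beta selection above (a worked restatement, not original text):
   cusparse_csr_spmv computes dptr = alpha*A*x + beta*dptr, and the alpha used here is one, so
     beta = beta_one  gives z = A*x + z  (when yy == zz and the result goes straight into zz),
     beta = beta_zero gives w = A*x      (when the product lands in the work vector first).
   Hence no separate axpy is needed when yy aliases zz, and the compressed-row scatter with
   VecCUDAPlusEquals handles the work-vector case.
*/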
1459: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1460: {
1461: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1462: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1463: Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1464: thrust::device_ptr<PetscScalar> zptr;
1465: const PetscScalar *xarray;
1466: PetscScalar *zarray;
1467: PetscErrorCode ierr;
1468: cusparseStatus_t stat;
1471: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1472: MatSeqAIJCUSPARSECopyToGPU(A);
1473: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1474: if (!matstructT) {
1475: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1476: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1477: }
1479: try {
1480: VecCopy_SeqCUDA(yy,zz);
1481: VecCUDAGetArrayRead(xx,&xarray);
1482: VecCUDAGetArrayReadWrite(zz,&zarray);
1483: zptr = thrust::device_pointer_cast(zarray);
1485: /* multiply add with matrix transpose */
1486: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1487: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1488: /* here we need to be careful to set the number of rows in the multiply to the
1489: number of compressed rows in the matrix ... which is equivalent to the
1490: size of the workVector */
1491: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1492: mat->num_rows, mat->num_cols,
1493: mat->num_entries, matstructT->alpha, matstructT->descr,
1494: mat->values->data().get(), mat->row_offsets->data().get(),
1495: mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1496: cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1497: } else {
1498: #if CUDA_VERSION>=4020
1499: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1500: if (cusparsestruct->workVector->size()) {
1501: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1502: matstructT->alpha, matstructT->descr, hybMat,
1503: xarray, matstructT->beta_zero,
1504: cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1505: }
1506: #endif
1507: }
1509: /* scatter the data from the temporary into the full vector with a += operation */
1510: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
1511: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
1512: VecCUDAPlusEquals());
1514: VecCUDARestoreArrayRead(xx,&xarray);
1515: VecCUDARestoreArrayReadWrite(zz,&zarray);
1517: } catch(char *ex) {
1518: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1519: }
1520: WaitForGPU();CHKERRCUDA(ierr);
1521: PetscLogFlops(2.0*a->nz);
1522: return(0);
1523: }
1525: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1526: {
1530: MatAssemblyEnd_SeqAIJ(A,mode);
1531: if (A->factortype==MAT_FACTOR_NONE) {
1532: MatSeqAIJCUSPARSECopyToGPU(A);
1533: }
1534: if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1535: A->ops->mult = MatMult_SeqAIJCUSPARSE;
1536: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1537: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1538: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1539: return(0);
1540: }
1542: /* --------------------------------------------------------------------------------*/
1543: /*@
1544: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1545: (the default sequential PETSc format). This matrix will ultimately be pushed down
1546: to NVIDIA GPUs and will use the CUSPARSE library for calculations. For good matrix
1547: assembly performance the user should preallocate the matrix storage by setting
1548: the parameter nz (or the array nnz). By setting these parameters accurately,
1549: performance during matrix assembly can be increased by more than a factor of 50.
1551: Collective on MPI_Comm
1553: Input Parameters:
1554: + comm - MPI communicator, set to PETSC_COMM_SELF
1555: . m - number of rows
1556: . n - number of columns
1557: . nz - number of nonzeros per row (same for all rows)
1558: - nnz - array containing the number of nonzeros in the various rows
1559: (possibly different for each row) or NULL
1561: Output Parameter:
1562: . A - the matrix
1564: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1565: MatXXXXSetPreallocation() paradigm instead of this routine directly; a sketch of that paradigm follows this routine.
1566: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1568: Notes:
1569: If nnz is given then nz is ignored
1571: The AIJ format (also called the Yale sparse matrix format or
1572: compressed row storage) is fully compatible with standard Fortran 77
1573: storage. That is, the stored row and column indices can begin at
1574: either one (as in Fortran) or zero. See the users' manual for details.
1576: Specify the preallocated storage with either nz or nnz (not both).
1577: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1578: allocation. For large problems you MUST preallocate memory or you
1579: will get TERRIBLE performance, see the users' manual chapter on matrices.
1581: By default, this format uses inodes (identical nodes) when possible, to
1582: improve numerical efficiency of matrix-vector products and solves. We
1583: search for consecutive rows with the same nonzero structure, thereby
1584: reusing matrix information to achieve increased efficiency.
1586: Level: intermediate
1588: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1589: @*/
1590: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1591: {
1595: MatCreate(comm,A);
1596: MatSetSizes(*A,m,n,m,n);
1597: MatSetType(*A,MATSEQAIJCUSPARSE);
1598: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1599: return(0);
1600: }
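/*
   Editorial sketch of the recommended creation paradigm referred to above (the sizes and the
   per-row nonzero estimate are made up for illustration, and error checking is abbreviated):

     Mat            A;
     PetscErrorCode ierr;
     ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,10,10,10,10);CHKERRQ(ierr);
     ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSeqAIJSetPreallocation(A,3,NULL);CHKERRQ(ierr);  (at most 3 nonzeros per row)
     (... MatSetValues(), MatAssemblyBegin(), MatAssemblyEnd() ...)
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/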
1602: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1603: {
1604: PetscErrorCode ierr;
1607: if (A->factortype==MAT_FACTOR_NONE) {
1608: if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
1609: MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1610: }
1611: } else {
1612: MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1613: }
1614: MatDestroy_SeqAIJ(A);
1615: return(0);
1616: }
1618: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1619: {
1621: Mat C;
1622: cusparseStatus_t stat;
1623: cusparseHandle_t handle=0;
1626: MatDuplicate_SeqAIJ(A,cpvalues,B);
1627: C = *B;
1628: PetscFree(C->defaultvectype);
1629: PetscStrallocpy(VECCUDA,&C->defaultvectype);
1631: /* inject CUSPARSE-specific stuff */
1632: if (C->factortype==MAT_FACTOR_NONE) {
1633: /* The inode.use flag cannot be checked here since the matrix was just created;
1634: now build a GPU matrix data structure */
1635: C->spptr = new Mat_SeqAIJCUSPARSE;
1636: ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0;
1637: ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
1638: ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0;
1639: ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR;
1640: ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0;
1641: ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = 0;
1642: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1643: ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle;
1644: ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0;
1645: ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
1646: } else {
1647: /* NEXT, set the pointers to the triangular factors */
1648: C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1649: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0;
1650: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0;
1651: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1652: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1653: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0;
1654: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0;
1655: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0;
1656: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0;
1657: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1658: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle;
1659: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0;
1660: }
1662: C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
1663: C->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
1664: C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
1665: C->ops->mult = MatMult_SeqAIJCUSPARSE;
1666: C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1667: C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1668: C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1669: C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
1671: PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);
1673: C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
1675: PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1676: return(0);
1677: }
1679: PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
1680: {
1682: cusparseStatus_t stat;
1683: cusparseHandle_t handle=0;
1686: PetscFree(B->defaultvectype);
1687: PetscStrallocpy(VECCUDA,&B->defaultvectype);
1689: if (B->factortype==MAT_FACTOR_NONE) {
1690: /* The inode.use flag cannot be checked here since the matrix was just created;
1691: now build a GPU matrix data structure */
1692: B->spptr = new Mat_SeqAIJCUSPARSE;
1693: ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0;
1694: ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1695: ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0;
1696: ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR;
1697: ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0;
1698: ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0;
1699: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1700: ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle;
1701: ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0;
1702: ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
1703: } else {
1704: /* NEXT, set the pointers to the triangular factors */
1705: B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1706: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
1707: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
1708: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1709: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1710: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0;
1711: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0;
1712: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0;
1713: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0;
1714: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1715: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle;
1716: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0;
1717: }
1719: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
1720: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
1721: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
1722: B->ops->mult = MatMult_SeqAIJCUSPARSE;
1723: B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1724: B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1725: B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1726: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
1728: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
1730: B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
1732: PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1733: return(0);
1734: }
1736: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1737: {
1741: MatCreate_SeqAIJ(B);
1742: MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);
1743: return(0);
1744: }
1746: /*MC
1747: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1749: A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in either
1750: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1751: All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
1753: Options Database Keys:
1754: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1755: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1756: - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1758: Level: beginner
1760: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1761: M*/
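/*
   Editorial sketch (illustrative, not original text): selecting this type and a storage format
   from the options database; the program name ./app is a placeholder:

     ./app -mat_type aijcusparse -mat_cusparse_storage_format hyb

   or programmatically, assuming a matrix A has already been created:

     ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_HYB);CHKERRQ(ierr);
*/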
1763: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1766: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1767: {
1771: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
1772: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
1773: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
1774: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
1775: return(0);
1776: }
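/*
   Editorial sketch (illustrative, not original text): once registered, the CUSPARSE
   factorizations can be requested through the usual solver-type mechanism, e.g.

     ./app -pc_type ilu -pc_factor_mat_solver_type cusparse -mat_type aijcusparse

   or directly, assuming an assembled MATSEQAIJCUSPARSE matrix A:

     Mat F;
     ierr = MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);CHKERRQ(ierr);
*/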
1779: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1780: {
1781: cusparseStatus_t stat;
1782: cusparseHandle_t handle;
1785: if (*cusparsestruct) {
1786: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1787: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1788: delete (*cusparsestruct)->workVector;
1789: if ((handle = (*cusparsestruct)->handle)) {
1790: stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1791: }
1792: delete *cusparsestruct;
1793: *cusparsestruct = 0;
1794: }
1795: return(0);
1796: }
1798: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1799: {
1801: if (*mat) {
1802: delete (*mat)->values;
1803: delete (*mat)->column_indices;
1804: delete (*mat)->row_offsets;
1805: delete *mat;
1806: *mat = 0;
1807: }
1808: return(0);
1809: }
1811: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1812: {
1813: cusparseStatus_t stat;
1814: PetscErrorCode ierr;
1817: if (*trifactor) {
1818: if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1819: if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1820: CsrMatrix_Destroy(&(*trifactor)->csrMat);
1821: delete *trifactor;
1822: *trifactor = 0;
1823: }
1824: return(0);
1825: }
1827: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1828: {
1829: CsrMatrix *mat;
1830: cusparseStatus_t stat;
1831: cudaError_t err;
1834: if (*matstruct) {
1835: if ((*matstruct)->mat) {
1836: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1837: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1838: stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1839: } else {
1840: mat = (CsrMatrix*)(*matstruct)->mat;
1841: CsrMatrix_Destroy(&mat);
1842: }
1843: }
1844: if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1845: delete (*matstruct)->cprowIndices;
1846: if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1847: if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
1848: if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
1849: delete *matstruct;
1850: *matstruct = 0;
1851: }
1852: return(0);
1853: }
1855: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1856: {
1857: cusparseHandle_t handle;
1858: cusparseStatus_t stat;
1861: if (*trifactors) {
1862: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1863: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1864: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1865: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1866: delete (*trifactors)->rpermIndices;
1867: delete (*trifactors)->cpermIndices;
1868: delete (*trifactors)->workVector;
1869: if ((handle = (*trifactors)->handle)) {
1870: stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1871: }
1872: delete *trifactors;
1873: *trifactors = 0;
1874: }
1875: return(0);
1876: }