Actual source code: aijcusparse.cu
petsc-3.13.6 2020-09-29
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  return(0);
}
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  return(0);
}
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  return(0);
}
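
/* Illustrative usage sketch for the three routines above (added for clarity; the
   caller-managed resources and their ownership here are assumptions, not part of
   the original source):

     cudaStream_t     s;
     cusparseHandle_t h;
     cudaStreamCreate(&s);
     cusparseCreate(&h);
     MatCUSPARSESetHandle(A,h);   the matrix adopts the handle, destroying any previous one
     MatCUSPARSESetStream(A,s);   subsequent cuSPARSE calls for A are queued on stream s

   MatCUSPARSEClearHandle(A) then simply forgets the handle without destroying it. */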
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  *type = MATSOLVERCUSPARSE;
  return(0);
}
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. The currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm; however, the performance can be quite poor, so these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
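
/* Illustrative usage sketch (added for clarity; the executable name and the choice of
   ILU are assumptions): the solver is normally selected through the options database,

     ./app -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   or programmatically through the factor interface,

     Mat F;
     MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);
*/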
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt n = A->rmap->n;

  MatCreate(PetscObjectComm((PetscObject)A),B);
  (*B)->factortype = ftype;
  MatSetSizes(*B,n,n,n,n);
  MatSetType(*B,MATSEQAIJCUSPARSE);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    MatSetBlockSizesFromMats(*B,A,A);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
  PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
  return(0);
}
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  return(0);
}
/*@
  MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
  operation. Only the MatMult operation can use different GPU storage formats
  for MPIAIJCUSPARSE matrices.

  Not Collective

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
. op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, or MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  return(0);
}
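
/* Illustrative usage sketch (added for clarity; error checking omitted): to run the
   SpMV in HYB format while leaving other operations in CSR,

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_HYB);

   or, equivalently, from the options database handled below:

     -mat_cusparse_mult_storage_format hyb
*/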
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
    }
  }
  PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                   "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
  if (flg) {
    MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
  }
  PetscOptionsTail();
  return(0);
}
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  if (!n) return(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]   = (PetscInt) 0;
      AiLo[n]   = nzLower;
      AjLo[0]   = (PetscInt) 0;
      AALo[0]   = (MatScalar) 1.0;
      v         = aa;
      vi        = aj;
      offset    = 1;
      rowOffset = 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        PetscArraycpy(&(AjLo[offset]), vi, nz);
        PetscArraycpy(&(AALo[offset]), v, nz);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  return(0);
}
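
/* Worked example of the layout built above (illustrative values, not from the source):
   for n = 3 with strictly lower entries L(1,0) = l10 and L(2,1) = l21, the loop produces

     AiLo = {0, 1, 3, 5}
     AjLo = {0,   0, 1,   1, 2}
     AALo = {1., l10, 1., l21, 1.}

   i.e. each row stores its strictly lower entries first and a 1.0 on the diagonal last;
   the descriptor then declares the factor lower triangular with a unit diagonal. */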
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  if (!n) return(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0] = (PetscInt) 0;
      AiUp[n] = nzUpper;
      offset  = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        PetscArraycpy(&(AjUp[offset+1]), vi, nz);
        PetscArraycpy(&(AAUp[offset+1]), v, nz);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = n;
      upTriFactor->csrMat->num_cols    = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  return(0);
}
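
/* Worked example of the layout built above (illustrative values): for n = 2 with one
   strictly upper entry u01 in row 0, the backward loop produces

     AiUp = {0, 2, 3}
     AjUp = {0, 1,   1}
     AAUp = {diag0, u01, diag1}

   where diag(i) = 1./v[nz] is computed from the stored diagonal of the factor; each
   row holds its diagonal entry first, then its strictly upper entries, as declared by
   the CUSPARSE_FILL_MODE_UPPER / CUSPARSE_DIAG_TYPE_NON_UNIT descriptor. */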
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ISGetIndices(isrow,&r);
  ISIdentity(isrow,&row_identity);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ISRestoreIndices(isrow,&r);

  /* upper triangular indices */
  ISGetIndices(iscol,&c);
  ISIdentity(iscol,&col_identity);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  if (!row_identity && !col_identity) {
    PetscLogCpuToGpu(2*n*sizeof(PetscInt));
  } else if (!row_identity) {
    PetscLogCpuToGpu(n*sizeof(PetscInt));
  } else if (!col_identity) {
    PetscLogCpuToGpu(n*sizeof(PetscInt));
  }

  ISRestoreIndices(iscol,&c);
  return(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  if (!n) return(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0] = (PetscInt) 0;
      AiUp[n] = nzUpper;
      offset  = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset += 1;
        if (nz > 0) {
          PetscArraycpy(&(AjUp[offset]), vj, nz);
          PetscArraycpy(&(AAUp[offset]), v, nz);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset += nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = A->rmap->n;
      upTriFactor->csrMat->num_cols    = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation; the "lower" factor reuses the upper-triangular storage and is solved as its transpose */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = A->rmap->n;
      loTriFactor->csrMat->num_cols    = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  return(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = (a->nz-n)*2 + n;

  /* lower triangular indices */
  ISGetIndices(ip,&rip);
  ISIdentity(ip,&perm_identity);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    ISInvertPermutation(ip,PETSC_DECIDE,&iip);
    ISGetIndices(iip,&irip);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ISRestoreIndices(iip,&irip);
    ISDestroy(&iip);
    PetscLogCpuToGpu(2*n*sizeof(PetscInt));
  }
  ISRestoreIndices(ip,&rip);
  return(0);
}
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  IS         isrow = b->row,iscol = b->col;
  PetscBool  row_identity,col_identity;

  MatLUFactorNumeric_SeqAIJ(B,A,info);
  /* determine which version of MatSolve needs to be used. */
  ISIdentity(isrow,&row_identity);
  ISIdentity(iscol,&col_identity);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
  return(0);
}
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  IS         ip = b->row;
  PetscBool  perm_identity;

  MatCholeskyFactorNumeric_SeqAIJ(B,A,info);

  /* determine which version of MatSolve needs to be used. */
  ISIdentity(ip,&perm_identity);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
  return(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  return(0);
}
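
/* Illustrative note on the csr2csc calls above (a 2x2 sketch, not from the source): the
   CSC of a matrix has the same memory layout as the CSR of its transpose, so after the
   conversion the factor can be solved with CUSPARSE_OPERATION_NON_TRANSPOSE. E.g. for

     A = [a 0; b c]   in CSR:  values = {a,b,c}, colidx = {0,0,1}, rowptr = {0,1,3}

   csr2csc yields values = {a,b,c}, rowidx = {0,1,1}, colptr = {0,2,3}, which read as
   CSR is exactly A^T = [a b; 0 c]. */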
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  /* allocate space for the triangular factor information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    /* assign the pointer */
    matstructT->mat = matrixT;
    PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp = new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT = new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  return(0);
}
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscLogGpuTimeBegin();
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
  return(0);
}
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);

  PetscLogGpuTimeBegin();
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore */
  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
  return(0);
}
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  /* Get the GPU pointers */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscLogGpuTimeBegin();
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
  return(0);
}
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  /* Get the GPU pointers */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);

  PetscLogGpuTimeBegin();
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
  return(0);
}
1216: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1217: {
1218: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1219: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1220: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1221: PetscInt m = A->rmap->n,*ii,*ridx;
1222: PetscErrorCode ierr;
1223: cusparseStatus_t stat;
1224: cudaError_t err;
1227: if (A->boundtocpu) return(0);
1228: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1229: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1230: if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1231: /* Copy values only */
1232: CsrMatrix *mat,*matT;
1233: mat = (CsrMatrix*)cusparsestruct->mat->mat;
1234: mat->values->assign(a->a, a->a+a->nz);
1235: PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1237: /* Update matT when it was built before */
1238: if (cusparsestruct->matTranspose) {
1239: cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1240: matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1241: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1242: A->cmap->n, mat->num_entries,
1243: mat->values->data().get(),
1244: cusparsestruct->rowoffsets_gpu->data().get(),
1245: mat->column_indices->data().get(),
1246: matT->values->data().get(),
1247: matT->column_indices->data().get(),
1248: matT->row_offsets->data().get(),
1249: CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
1250: }
1251: } else {
1252: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1253: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);
1254: delete cusparsestruct->workVector;
1255: delete cusparsestruct->rowoffsets_gpu;
1256: try {
1257: cusparsestruct->nonzerorow=0;
1258: for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1260: if (a->compressedrow.use) {
1261: m = a->compressedrow.nrows;
1262: ii = a->compressedrow.i;
1263: ridx = a->compressedrow.rindex;
1264: } else {
1265: /* Forcing compressed row on the GPU */
1266: int k=0;
1267: PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1268: PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1269: ii[0]=0;
1270: for (int j = 0; j<m; j++) {
1271: if ((a->i[j+1]-a->i[j])>0) {
1272: ii[k] = a->i[j];
1273: ridx[k]= j;
1274: k++;
1275: }
1276: }
1277: ii[cusparsestruct->nonzerorow] = a->nz;
1278: m = cusparsestruct->nonzerorow;
1279: }
1281: /* allocate space for the triangular factor information */
1282: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1283: stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1284: stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1285: stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1287: err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
1288: err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1289: err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1290: err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1291: err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1292: err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1293: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1295: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1296: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1297: /* set the matrix */
1298: CsrMatrix *matrix= new CsrMatrix;
1299: matrix->num_rows = m;
1300: matrix->num_cols = A->cmap->n;
1301: matrix->num_entries = a->nz;
1302: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1303: matrix->row_offsets->assign(ii, ii + m+1);
1305: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1306: matrix->column_indices->assign(a->j, a->j+a->nz);
1308: matrix->values = new THRUSTARRAY(a->nz);
1309: matrix->values->assign(a->a, a->a+a->nz);
1311: /* assign the pointer */
1312: matstruct->mat = matrix;
1314: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1315: CsrMatrix *matrix= new CsrMatrix;
1316: matrix->num_rows = m;
1317: matrix->num_cols = A->cmap->n;
1318: matrix->num_entries = a->nz;
1319: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1320: matrix->row_offsets->assign(ii, ii + m+1);
1322: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1323: matrix->column_indices->assign(a->j, a->j+a->nz);
1325: matrix->values = new THRUSTARRAY(a->nz);
1326: matrix->values->assign(a->a, a->a+a->nz);
1328: cusparseHybMat_t hybMat;
1329: stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1330: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1331: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1332: stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1333: matstruct->descr, matrix->values->data().get(),
1334: matrix->row_offsets->data().get(),
1335: matrix->column_indices->data().get(),
1336: hybMat, 0, partition);CHKERRCUSPARSE(stat);
1337: /* assign the pointer */
1338: matstruct->mat = hybMat;
1340: if (matrix) {
1341: if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1342: if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1343: if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1344: delete (CsrMatrix*)matrix;
1345: }
1346: }
1348: /* assign the compressed row indices */
1349: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1350: matstruct->cprowIndices->assign(ridx,ridx+m);
1351: PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));
1353: /* assign the pointer */
1354: cusparsestruct->mat = matstruct;
1356: if (!a->compressedrow.use) {
1357: PetscFree(ii);
1358: PetscFree(ridx);
1359: }
1360: cusparsestruct->workVector = new THRUSTARRAY(m);
1361: } catch(char *ex) {
1362: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1363: }
1364: cusparsestruct->nonzerostate = A->nonzerostate;
1365: }
1366: err = WaitForGPU();CHKERRCUDA(err);
1367: A->offloadmask = PETSC_OFFLOAD_BOTH;
1368: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1369: }
1370: return(0);
1371: }
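/* Thrust functor implementing a fused "+=": for a tuple t = (src,dst) it performs
   dst += src. It is applied below to scatter-add the compressed work vector into
   the full-length output vector. */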
1373: struct VecCUDAPlusEquals
1374: {
1375: template <typename Tuple>
1376: __host__ __device__
1377: void operator()(Tuple t)
1378: {
1379: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1380: }
1381: };
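/* A minimal self-contained sketch of the scatter-add pattern used in
   MatMultAdd_SeqAIJCUSPARSE below; the vector names here are illustrative
   assumptions, not taken from this file:

     thrust::device_vector<PetscScalar> w(nc), z(n);   // work and output vectors
     thrust::device_vector<int>         idx(nc);       // compressed row indices
     thrust::for_each(
       thrust::make_zip_iterator(thrust::make_tuple(
         w.begin(), thrust::make_permutation_iterator(z.begin(), idx.begin()))),
       thrust::make_zip_iterator(thrust::make_tuple(
         w.end(),   thrust::make_permutation_iterator(z.begin(), idx.end()))),
       VecCUDAPlusEquals());

   after which z[idx[i]] += w[i] holds for i = 0..nc-1. */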
1383: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1384: {
1388: MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);
1389: return(0);
1390: }
1392: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1393: {
1394: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1395: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1396: Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1397: const PetscScalar *xarray;
1398: PetscScalar *yarray;
1399: PetscErrorCode ierr;
1400: cudaError_t cerr;
1401: cusparseStatus_t stat;
1404: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1405: MatSeqAIJCUSPARSECopyToGPU(A);
1406: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
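/* The transpose multiply structure is built lazily: it is generated on first use here and then reused until the nonzero pattern changes */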
1407: if (!matstructT) {
1408: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1409: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1410: }
1411: VecCUDAGetArrayRead(xx,&xarray);
1412: VecCUDAGetArrayWrite(yy,&yarray);
1413: if (yy->map->n) {
1414: PetscInt n = yy->map->n;
1415: cudaError_t err;
1416: err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* zero the output array first: the spmv may read it even with beta = 0, so uninitialized values could otherwise corrupt the result */
1417: }
1419: PetscLogGpuTimeBegin();
1420: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1421: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1422: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1423: mat->num_rows, mat->num_cols,
1424: mat->num_entries, matstructT->alpha, matstructT->descr,
1425: mat->values->data().get(), mat->row_offsets->data().get(),
1426: mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1427: yarray);CHKERRCUSPARSE(stat);
1428: } else {
1429: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1430: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1431: matstructT->alpha, matstructT->descr, hybMat,
1432: xarray, matstructT->beta_zero,
1433: yarray);CHKERRCUSPARSE(stat);
1434: }
1435: VecCUDARestoreArrayRead(xx,&xarray);
1436: VecCUDARestoreArrayWrite(yy,&yarray);
1437: cerr = WaitForGPU();CHKERRCUDA(cerr);
1438: PetscLogGpuTimeEnd();
1439: PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1440: return(0);
1441: }
1444: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1445: {
1446: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1447: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1448: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1449: const PetscScalar *xarray;
1450: PetscScalar *zarray,*dptr,*beta;
1451: PetscErrorCode ierr;
1452: cudaError_t cerr;
1453: cusparseStatus_t stat;
1454: PetscBool cmpr; /* PETSC_TRUE when the work vector spans every row, i.e. no zero rows were compressed out */
1457: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1458: MatSeqAIJCUSPARSECopyToGPU(A);
1459: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1460: try {
1461: cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
1462: VecCUDAGetArrayRead(xx,&xarray);
1463: if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
1464: VecCUDAGetArray(zz,&zarray);
1465: } else {
1466: VecCUDAGetArrayWrite(zz,&zarray);
1467: }
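/* Choose the spmv destination and beta: with no row compression the product goes
   straight into zarray, and when zz aliases yy a beta of one lets the spmv itself
   perform the add; otherwise beta is zero and yy is combined in afterwards */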
1468: dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
1469: beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;
1471: PetscLogGpuTimeBegin();
1472: /* csr_spmv is multiply add */
1473: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1474: /* here we need to be careful to set the number of rows in the multiply to the
1475: number of compressed rows in the matrix ... which is equivalent to the
1476: size of the workVector */
1477: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1478: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1479: mat->num_rows, mat->num_cols,
1480: mat->num_entries, matstruct->alpha, matstruct->descr,
1481: mat->values->data().get(), mat->row_offsets->data().get(),
1482: mat->column_indices->data().get(), xarray, beta,
1483: dptr);CHKERRCUSPARSE(stat);
1484: } else {
1485: if (cusparsestruct->workVector->size()) {
1486: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1487: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1488: matstruct->alpha, matstruct->descr, hybMat,
1489: xarray, beta,
1490: dptr);CHKERRCUSPARSE(stat);
1491: }
1492: }
1493: PetscLogGpuTimeEnd();
1495: if (yy) {
1496: if (dptr != zarray) {
1497: VecCopy_SeqCUDA(yy,zz);
1498: } else if (zz != yy) {
1499: VecAXPY_SeqCUDA(zz,1.0,yy);
1500: }
1501: } else if (dptr != zarray) {
1502: VecSet_SeqCUDA(zz,0);
1503: }
1504: /* scatter the data from the temporary into the full vector with a += operation */
1505: PetscLogGpuTimeBegin();
1506: if (dptr != zarray) {
1507: thrust::device_ptr<PetscScalar> zptr;
1509: zptr = thrust::device_pointer_cast(zarray);
1510: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1511: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1512: VecCUDAPlusEquals());
1513: }
1514: PetscLogGpuTimeEnd();
1515: VecCUDARestoreArrayRead(xx,&xarray);
1516: if (yy && !cmpr) {
1517: VecCUDARestoreArray(zz,&zarray);
1518: } else {
1519: VecCUDARestoreArrayWrite(zz,&zarray);
1520: }
1521: } catch(char *ex) {
1522: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1523: }
1524: cerr = WaitForGPU();CHKERRCUDA(cerr);
1525: PetscLogGpuFlops(2.0*a->nz);
1526: return(0);
1527: }
1529: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1530: {
1531: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1532: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1533: Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1534: const PetscScalar *xarray;
1535: PetscScalar *zarray,*beta;
1536: PetscErrorCode ierr;
1537: cudaError_t cerr;
1538: cusparseStatus_t stat;
1541: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1542: MatSeqAIJCUSPARSECopyToGPU(A);
1543: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1544: if (!matstructT) {
1545: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1546: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1547: }
1549: /* Note: unlike the forward multiply, the transpose multiply uses non-compressed row storage */
1550: try {
1551: VecCopy_SeqCUDA(yy,zz);
1552: VecCUDAGetArrayRead(xx,&xarray);
1553: VecCUDAGetArray(zz,&zarray);
1554: beta = (yy == zz) ? matstructT->beta_one : matstructT->beta_zero;
1556: PetscLogGpuTimeBegin();
1557: /* multiply add with matrix transpose */
1558: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1559: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1560: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1561: mat->num_rows, mat->num_cols,
1562: mat->num_entries, matstructT->alpha, matstructT->descr,
1563: mat->values->data().get(), mat->row_offsets->data().get(),
1564: mat->column_indices->data().get(), xarray, beta,
1565: zarray);CHKERRCUSPARSE(stat);
1566: } else {
1567: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1568: if (cusparsestruct->workVector->size()) {
1569: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1570: matstructT->alpha, matstructT->descr, hybMat,
1571: xarray, beta,
1572: zarray);CHKERRCUSPARSE(stat);
1573: }
1574: }
1575: PetscLogGpuTimeEnd();
1577: if (zz != yy) {VecAXPY_SeqCUDA(zz,1.0,yy);}
1578: VecCUDARestoreArrayRead(xx,&xarray);
1579: VecCUDARestoreArray(zz,&zarray);
1580: } catch(char *ex) {
1581: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1582: }
1583: cerr = WaitForGPU();CHKERRCUDA(cerr);
1584: PetscLogGpuFlops(2.0*a->nz);
1585: return(0);
1586: }
1588: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1589: {
1593: MatAssemblyEnd_SeqAIJ(A,mode);
1594: if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) return(0);
1595: if (A->factortype == MAT_FACTOR_NONE) {
1596: MatSeqAIJCUSPARSECopyToGPU(A);
1597: }
1598: A->ops->mult = MatMult_SeqAIJCUSPARSE;
1599: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1600: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1601: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1602: return(0);
1603: }
1605: /* --------------------------------------------------------------------------------*/
1606: /*@
1607: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1608: (the default parallel PETSc format). This matrix will ultimately be pushed down
1609: to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1610: assembly performance the user should preallocate the matrix storage by setting
1611: the parameter nz (or the array nnz). By setting these parameters accurately,
1612: performance during matrix assembly can be increased by more than a factor of 50.
1614: Collective
1616: Input Parameters:
1617: + comm - MPI communicator, set to PETSC_COMM_SELF
1618: . m - number of rows
1619: . n - number of columns
1620: . nz - number of nonzeros per row (same for all rows)
1621: - nnz - array containing the number of nonzeros in the various rows
1622: (possibly different for each row) or NULL
1624: Output Parameter:
1625: . A - the matrix
1627: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1628: MatXXXXSetPreallocation() paradigm instead of calling this routine directly.
1629: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1631: Notes:
1632: If nnz is given then nz is ignored
1634: The AIJ format (also called the Yale sparse matrix format or
1635: compressed row storage), is fully compatible with standard Fortran 77
1636: storage. That is, the stored row and column indices can begin at
1637: either one (as in Fortran) or zero. See the users' manual for details.
1639: Specify the preallocated storage with either nz or nnz (not both).
1640: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1641: allocation. For large problems you MUST preallocate memory or you
1642: will get TERRIBLE performance, see the users' manual chapter on matrices.
1644: By default, this format uses inodes (identical nodes) when possible, to
1645: improve numerical efficiency of matrix-vector products and solves. We
1646: search for consecutive rows with the same nonzero structure, thereby
1647: reusing matrix information to achieve increased efficiency.
1649: Level: intermediate
1651: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1652: @*/
1653: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1654: {
1658: MatCreate(comm,A);
1659: MatSetSizes(*A,m,n,m,n);
1660: MatSetType(*A,MATSEQAIJCUSPARSE);
1661: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1662: return(0);
1663: }
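/* A minimal sketch of the MatCreate()/MatSetType() paradigm recommended above
   (illustrative only: sizes, nz, and error checking are assumed, not taken from
   this file):

     Mat A;
     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,m,n,m,n);
     MatSetType(A,MATSEQAIJCUSPARSE);
     MatSeqAIJSetPreallocation(A,nz,NULL);
     MatSetFromOptions(A);
*/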
1665: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1666: {
1667: PetscErrorCode ierr;
1670: if (A->factortype==MAT_FACTOR_NONE) {
1671: if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
1672: MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1673: }
1674: } else {
1675: MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1676: }
1677: MatDestroy_SeqAIJ(A);
1678: return(0);
1679: }
1681: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
1682: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1683: {
1685: Mat C;
1686: cusparseStatus_t stat;
1687: cusparseHandle_t handle=0;
1690: MatDuplicate_SeqAIJ(A,cpvalues,B);
1691: C = *B;
1692: PetscFree(C->defaultvectype);
1693: PetscStrallocpy(VECCUDA,&C->defaultvectype);
1695: /* inject CUSPARSE-specific stuff */
1696: if (C->factortype==MAT_FACTOR_NONE) {
1697: /* You cannot check the inode.use flag here since the matrix was just created.
1698: Now build a GPU matrix data structure. */
1699: C->spptr = new Mat_SeqAIJCUSPARSE;
1700: ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0;
1701: ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
1702: ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0;
1703: ((Mat_SeqAIJCUSPARSE*)C->spptr)->rowoffsets_gpu = 0;
1704: ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR;
1705: ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0;
1706: stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
1707: ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle;
1708: ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
1709: } else {
1710: /* NEXT, set the pointers to the triangular factors */
1711: C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1712: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0;
1713: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0;
1714: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1715: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1716: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0;
1717: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0;
1718: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0;
1719: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0;
1720: stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
1721: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle;
1722: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0;
1723: }
1725: C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
1726: C->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
1727: C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
1728: C->ops->mult = MatMult_SeqAIJCUSPARSE;
1729: C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1730: C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1731: C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1732: C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
1733: C->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
1735: PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);
1737: C->boundtocpu = PETSC_FALSE;
1738: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1740: PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1741: return(0);
1742: }
1745: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
1746: {
1748: /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
1749: If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
1750: Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() do nothing in this case.
1751: TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
1752: can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
1753: if (A->offloadmask == PETSC_OFFLOAD_GPU) return(0);
1754: if (flg) {
1755: A->ops->mult = MatMult_SeqAIJ;
1756: A->ops->multadd = MatMultAdd_SeqAIJ;
1757: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
1758: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
1759: A->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
1760: A->ops->duplicate = MatDuplicate_SeqAIJ;
1761: } else {
1762: A->ops->mult = MatMult_SeqAIJCUSPARSE;
1763: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1764: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1765: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1766: A->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
1767: A->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
1768: A->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
1769: }
1770: A->boundtocpu = flg;
1771: return(0);
1772: }
1774: PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
1775: {
1777: cusparseStatus_t stat;
1778: cusparseHandle_t handle=0;
1781: PetscFree(B->defaultvectype);
1782: PetscStrallocpy(VECCUDA,&B->defaultvectype);
1784: if (B->factortype==MAT_FACTOR_NONE) {
1785: /* You cannot check the inode.use flag here since the matrix was just created.
1786: Now build a GPU matrix data structure. */
1787: B->spptr = new Mat_SeqAIJCUSPARSE;
1788: ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0;
1789: ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1790: ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0;
1791: ((Mat_SeqAIJCUSPARSE*)B->spptr)->rowoffsets_gpu = 0;
1792: ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR;
1793: ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0;
1794: stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
1795: ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle;
1796: ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
1797: } else {
1798: /* NEXT, set the pointers to the triangular factors */
1799: B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1800: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
1801: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
1802: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1803: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1804: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0;
1805: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0;
1806: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0;
1807: stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
1808: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle;
1809: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0;
1810: }
1812: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
1813: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
1814: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
1815: B->ops->mult = MatMult_SeqAIJCUSPARSE;
1816: B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1817: B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1818: B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1819: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
1820: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
1822: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
1824: B->boundtocpu = PETSC_FALSE;
1825: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1827: PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1828: return(0);
1829: }
1831: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1832: {
1836: MatCreate_SeqAIJ(B);
1837: MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);
1838: return(0);
1839: }
1841: /*MC
1842: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1844: A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1845: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1846: All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1848: Options Database Keys:
1849: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1850: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1851: - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1853: Level: beginner
1855: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1856: M*/
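/* A minimal sketch of selecting the storage format, either from the options
   database or programmatically (illustrative; assumes an already created matrix A
   of this type):

     ./app -mat_type seqaijcusparse -mat_cusparse_storage_format hyb

   or, in code:

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_HYB);
*/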
1858: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1861: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1862: {
1866: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
1867: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
1868: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
1869: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
1870: return(0);
1871: }
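/* A minimal sketch of requesting these GPU factorizations in a linear solve
   (illustrative; assumes a standard KSP/PC setup):

     ./app -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   or programmatically:

     PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);
*/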
1874: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1875: {
1876: cusparseStatus_t stat;
1877: cusparseHandle_t handle;
1880: if (*cusparsestruct) {
1881: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1882: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1883: delete (*cusparsestruct)->workVector;
1884: delete (*cusparsestruct)->rowoffsets_gpu;
1885: if ((handle = (*cusparsestruct)->handle)) {
1886: stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
1887: }
1888: delete *cusparsestruct;
1889: *cusparsestruct = 0;
1890: }
1891: return(0);
1892: }
1894: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1895: {
1897: if (*mat) {
1898: delete (*mat)->values;
1899: delete (*mat)->column_indices;
1900: delete (*mat)->row_offsets;
1901: delete *mat;
1902: *mat = 0;
1903: }
1904: return(0);
1905: }
1907: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1908: {
1909: cusparseStatus_t stat;
1910: PetscErrorCode ierr;
1913: if (*trifactor) {
1914: if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
1915: if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
1916: CsrMatrix_Destroy(&(*trifactor)->csrMat);
1917: delete *trifactor;
1918: *trifactor = 0;
1919: }
1920: return(0);
1921: }
1923: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1924: {
1925: CsrMatrix *mat;
1926: cusparseStatus_t stat;
1927: cudaError_t err;
1930: if (*matstruct) {
1931: if ((*matstruct)->mat) {
1932: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1933: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1934: stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
1935: } else {
1936: mat = (CsrMatrix*)(*matstruct)->mat;
1937: CsrMatrix_Destroy(&mat);
1938: }
1939: }
1940: if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
1941: delete (*matstruct)->cprowIndices;
1942: if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1943: if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
1944: if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
1945: delete *matstruct;
1946: *matstruct = 0;
1947: }
1948: return(0);
1949: }
1951: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1952: {
1953: cusparseHandle_t handle;
1954: cusparseStatus_t stat;
1957: if (*trifactors) {
1958: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1959: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1960: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1961: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1962: delete (*trifactors)->rpermIndices;
1963: delete (*trifactors)->cpermIndices;
1964: delete (*trifactors)->workVector;
1965: if ((handle = (*trifactors)->handle)) {
1966: stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
1967: }
1968: delete *trifactors;
1969: *trifactors = 0;
1970: }
1971: return(0);
1972: }