Actual source code: aijcusparse.cu
petsc-3.12.5 2020-03-29
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library,
4: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_CXX_COMPLEX_FIX
7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
9: #include <petscconf.h>
10: #include <../src/mat/impls/aij/seq/aij.h>
11: #include <../src/mat/impls/sbaij/seq/sbaij.h>
12: #include <../src/vec/vec/impls/dvecimpl.h>
13: #include <petsc/private/vecimpl.h>
14: #undef VecType
15: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
17: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
19: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
23: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
24: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
27: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
32: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
33: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
35: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
37: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
38: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
39: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
40: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
41: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
43: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
44: {
45: cusparseStatus_t stat;
46: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
49: cusparsestruct->stream = stream;
50: stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
51: return(0);
52: }
54: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
55: {
56: cusparseStatus_t stat;
57: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
60: if (cusparsestruct->handle != handle) {
61: if (cusparsestruct->handle) {
62: stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
63: }
64: cusparsestruct->handle = handle;
65: }
66: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
67: return(0);
68: }
70: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
71: {
72: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
74: if (cusparsestruct->handle)
75: cusparsestruct->handle = 0;
76: return(0);
77: }
79: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
80: {
82: *type = MATSOLVERCUSPARSE;
83: return(0);
84: }
86: /*MC
87: MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
88: on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
89: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
90: performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
91: CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
92: algorithms are not recommended. This class does NOT support direct solver operations.
94: Level: beginner
96: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
97: M*/
99: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
100: {
102: PetscInt n = A->rmap->n;
105: MatCreate(PetscObjectComm((PetscObject)A),B);
106: (*B)->factortype = ftype;
107: MatSetSizes(*B,n,n,n,n);
108: MatSetType(*B,MATSEQAIJCUSPARSE);
110: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
111: MatSetBlockSizesFromMats(*B,A,A);
112: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
113: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
114: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
115: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
116: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
117: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
119: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
120: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
121: return(0);
122: }
124: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
125: {
126: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
129: switch (op) {
130: case MAT_CUSPARSE_MULT:
131: cusparsestruct->format = format;
132: break;
133: case MAT_CUSPARSE_ALL:
134: cusparsestruct->format = format;
135: break;
136: default:
137: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
138: }
139: return(0);
140: }
142: /*@
143: MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
144: operation. Only the MatMult operation can use different GPU storage formats
145: for MPIAIJCUSPARSE matrices.
146: Not Collective
148: Input Parameters:
149: + A - Matrix of type SEQAIJCUSPARSE
150: . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
151: - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
153: Output Parameter:
155: Level: intermediate
157: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158: @*/
159: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
160: {
165: PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
166: return(0);
167: }
169: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
170: {
171: PetscErrorCode ierr;
172: MatCUSPARSEStorageFormat format;
173: PetscBool flg;
174: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
177: PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
178: if (A->factortype==MAT_FACTOR_NONE) {
179: PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
180: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
181: if (flg) {
182: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
183: }
184: }
185: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
186: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
187: if (flg) {
188: MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
189: }
190: PetscOptionsTail();
191: return(0);
193: }
195: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
196: {
200: MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
201: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
202: return(0);
203: }
205: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
206: {
210: MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
211: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
212: return(0);
213: }
215: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
216: {
220: MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
221: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
222: return(0);
223: }
225: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
226: {
230: MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
231: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
232: return(0);
233: }
235: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
236: {
237: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
238: PetscInt n = A->rmap->n;
239: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
240: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
241: cusparseStatus_t stat;
242: const PetscInt *ai = a->i,*aj = a->j,*vi;
243: const MatScalar *aa = a->a,*v;
244: PetscInt *AiLo, *AjLo;
245: PetscScalar *AALo;
246: PetscInt i,nz, nzLower, offset, rowOffset;
247: PetscErrorCode ierr;
250: if (!n) return(0);
251: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
252: try {
253: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
254: nzLower=n+ai[n]-ai[1];
256: /* Allocate Space for the lower triangular matrix */
257: cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
258: cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
259: cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);
261: /* Fill the lower triangular matrix */
262: AiLo[0] = (PetscInt) 0;
263: AiLo[n] = nzLower;
264: AjLo[0] = (PetscInt) 0;
265: AALo[0] = (MatScalar) 1.0;
266: v = aa;
267: vi = aj;
268: offset = 1;
269: rowOffset= 1;
270: for (i=1; i<n; i++) {
271: nz = ai[i+1] - ai[i];
272: /* additional 1 for the term on the diagonal */
273: AiLo[i] = rowOffset;
274: rowOffset += nz+1;
276: PetscArraycpy(&(AjLo[offset]), vi, nz);
277: PetscArraycpy(&(AALo[offset]), v, nz);
279: offset += nz;
280: AjLo[offset] = (PetscInt) i;
281: AALo[offset] = (MatScalar) 1.0;
282: offset += 1;
284: v += nz;
285: vi += nz;
286: }
288: /* allocate space for the triangular factor information */
289: loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
291: /* Create the matrix description */
292: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
293: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
294: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
295: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
296: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
298: /* Create the solve analysis information */
299: stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
301: /* set the operation */
302: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
304: /* set the matrix */
305: loTriFactor->csrMat = new CsrMatrix;
306: loTriFactor->csrMat->num_rows = n;
307: loTriFactor->csrMat->num_cols = n;
308: loTriFactor->csrMat->num_entries = nzLower;
310: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
311: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
313: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
314: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
316: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
317: loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
319: /* perform the solve analysis */
320: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
321: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
322: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
323: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
325: /* assign the pointer. Is this really necessary? */
326: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
328: cudaFreeHost(AiLo);CHKERRCUDA(ierr);
329: cudaFreeHost(AjLo);CHKERRCUDA(ierr);
330: cudaFreeHost(AALo);CHKERRCUDA(ierr);
331: PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
332: } catch(char *ex) {
333: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
334: }
335: }
336: return(0);
337: }
339: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
340: {
341: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
342: PetscInt n = A->rmap->n;
343: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
344: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
345: cusparseStatus_t stat;
346: const PetscInt *aj = a->j,*adiag = a->diag,*vi;
347: const MatScalar *aa = a->a,*v;
348: PetscInt *AiUp, *AjUp;
349: PetscScalar *AAUp;
350: PetscInt i,nz, nzUpper, offset;
351: PetscErrorCode ierr;
354: if (!n) return(0);
355: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
356: try {
357: /* next, figure out the number of nonzeros in the upper triangular matrix. */
358: nzUpper = adiag[0]-adiag[n];
360: /* Allocate Space for the upper triangular matrix */
361: cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
362: cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
363: cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
365: /* Fill the upper triangular matrix */
366: AiUp[0]=(PetscInt) 0;
367: AiUp[n]=nzUpper;
368: offset = nzUpper;
369: for (i=n-1; i>=0; i--) {
370: v = aa + adiag[i+1] + 1;
371: vi = aj + adiag[i+1] + 1;
373: /* number of elements NOT on the diagonal */
374: nz = adiag[i] - adiag[i+1]-1;
376: /* decrement the offset */
377: offset -= (nz+1);
379: /* first, set the diagonal elements */
380: AjUp[offset] = (PetscInt) i;
381: AAUp[offset] = (MatScalar)1./v[nz];
382: AiUp[i] = AiUp[i+1] - (nz+1);
384: PetscArraycpy(&(AjUp[offset+1]), vi, nz);
385: PetscArraycpy(&(AAUp[offset+1]), v, nz);
386: }
388: /* allocate space for the triangular factor information */
389: upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
391: /* Create the matrix description */
392: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
393: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
394: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
395: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
396: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
398: /* Create the solve analysis information */
399: stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
401: /* set the operation */
402: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
404: /* set the matrix */
405: upTriFactor->csrMat = new CsrMatrix;
406: upTriFactor->csrMat->num_rows = n;
407: upTriFactor->csrMat->num_cols = n;
408: upTriFactor->csrMat->num_entries = nzUpper;
410: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
411: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
413: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
414: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
416: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
417: upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
419: /* perform the solve analysis */
420: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
421: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
422: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
423: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
425: /* assign the pointer. Is this really necessary? */
426: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
428: cudaFreeHost(AiUp);CHKERRCUDA(ierr);
429: cudaFreeHost(AjUp);CHKERRCUDA(ierr);
430: cudaFreeHost(AAUp);CHKERRCUDA(ierr);
431: PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
432: } catch(char *ex) {
433: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
434: }
435: }
436: return(0);
437: }
439: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
440: {
441: PetscErrorCode ierr;
442: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
443: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
444: IS isrow = a->row,iscol = a->icol;
445: PetscBool row_identity,col_identity;
446: const PetscInt *r,*c;
447: PetscInt n = A->rmap->n;
450: MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
451: MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);
453: cusparseTriFactors->workVector = new THRUSTARRAY(n);
454: cusparseTriFactors->nnz=a->nz;
456: A->offloadmask = PETSC_OFFLOAD_BOTH;
457: /* lower triangular indices */
458: ISGetIndices(isrow,&r);
459: ISIdentity(isrow,&row_identity);
460: if (!row_identity) {
461: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
462: cusparseTriFactors->rpermIndices->assign(r, r+n);
463: }
464: ISRestoreIndices(isrow,&r);
466: /* upper triangular indices */
467: ISGetIndices(iscol,&c);
468: ISIdentity(iscol,&col_identity);
469: if (!col_identity) {
470: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
471: cusparseTriFactors->cpermIndices->assign(c, c+n);
472: }
474: if (!row_identity && !col_identity) {
475: PetscLogCpuToGpu(2*n*sizeof(PetscInt));
476: } else if(!row_identity) {
477: PetscLogCpuToGpu(n*sizeof(PetscInt));
478: } else if(!col_identity) {
479: PetscLogCpuToGpu(n*sizeof(PetscInt));
480: }
482: ISRestoreIndices(iscol,&c);
483: return(0);
484: }
486: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
487: {
488: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
489: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
490: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
491: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
492: cusparseStatus_t stat;
493: PetscErrorCode ierr;
494: PetscInt *AiUp, *AjUp;
495: PetscScalar *AAUp;
496: PetscScalar *AALo;
497: PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
498: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
499: const PetscInt *ai = b->i,*aj = b->j,*vj;
500: const MatScalar *aa = b->a,*v;
503: if (!n) return(0);
504: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
505: try {
506: /* Allocate Space for the upper triangular matrix */
507: cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
508: cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
509: cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
510: cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
512: /* Fill the upper triangular matrix */
513: AiUp[0]=(PetscInt) 0;
514: AiUp[n]=nzUpper;
515: offset = 0;
516: for (i=0; i<n; i++) {
517: /* set the pointers */
518: v = aa + ai[i];
519: vj = aj + ai[i];
520: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
522: /* first, set the diagonal elements */
523: AjUp[offset] = (PetscInt) i;
524: AAUp[offset] = (MatScalar)1.0/v[nz];
525: AiUp[i] = offset;
526: AALo[offset] = (MatScalar)1.0/v[nz];
528: offset+=1;
529: if (nz>0) {
530: PetscArraycpy(&(AjUp[offset]), vj, nz);
531: PetscArraycpy(&(AAUp[offset]), v, nz);
532: for (j=offset; j<offset+nz; j++) {
533: AAUp[j] = -AAUp[j];
534: AALo[j] = AAUp[j]/v[nz];
535: }
536: offset+=nz;
537: }
538: }
540: /* allocate space for the triangular factor information */
541: upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
543: /* Create the matrix description */
544: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
545: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
546: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
547: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
548: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
550: /* Create the solve analysis information */
551: stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
553: /* set the operation */
554: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
556: /* set the matrix */
557: upTriFactor->csrMat = new CsrMatrix;
558: upTriFactor->csrMat->num_rows = A->rmap->n;
559: upTriFactor->csrMat->num_cols = A->cmap->n;
560: upTriFactor->csrMat->num_entries = a->nz;
562: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
563: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
565: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
566: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
568: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
569: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
571: /* perform the solve analysis */
572: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
573: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
574: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
575: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
577: /* assign the pointer. Is this really necessary? */
578: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
580: /* allocate space for the triangular factor information */
581: loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
583: /* Create the matrix description */
584: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
585: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
586: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
587: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
588: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
590: /* Create the solve analysis information */
591: stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
593: /* set the operation */
594: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
596: /* set the matrix */
597: loTriFactor->csrMat = new CsrMatrix;
598: loTriFactor->csrMat->num_rows = A->rmap->n;
599: loTriFactor->csrMat->num_cols = A->cmap->n;
600: loTriFactor->csrMat->num_entries = a->nz;
602: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
603: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
605: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
606: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
608: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
609: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
610: PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));
612: /* perform the solve analysis */
613: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
614: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
615: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
616: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
618: /* assign the pointer. Is this really necessary? */
619: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
621: A->offloadmask = PETSC_OFFLOAD_BOTH;
622: cudaFreeHost(AiUp);CHKERRCUDA(ierr);
623: cudaFreeHost(AjUp);CHKERRCUDA(ierr);
624: cudaFreeHost(AAUp);CHKERRCUDA(ierr);
625: cudaFreeHost(AALo);CHKERRCUDA(ierr);
626: } catch(char *ex) {
627: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
628: }
629: }
630: return(0);
631: }
633: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
634: {
635: PetscErrorCode ierr;
636: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
637: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
638: IS ip = a->row;
639: const PetscInt *rip;
640: PetscBool perm_identity;
641: PetscInt n = A->rmap->n;
644: MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
645: cusparseTriFactors->workVector = new THRUSTARRAY(n);
646: cusparseTriFactors->nnz=(a->nz-n)*2 + n;
648: /* lower triangular indices */
649: ISGetIndices(ip,&rip);
650: ISIdentity(ip,&perm_identity);
651: if (!perm_identity) {
652: IS iip;
653: const PetscInt *irip;
655: ISInvertPermutation(ip,PETSC_DECIDE,&iip);
656: ISGetIndices(iip,&irip);
657: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
658: cusparseTriFactors->rpermIndices->assign(rip, rip+n);
659: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
660: cusparseTriFactors->cpermIndices->assign(irip, irip+n);
661: ISRestoreIndices(iip,&irip);
662: ISDestroy(&iip);
663: PetscLogCpuToGpu(2*n*sizeof(PetscInt));
664: }
665: ISRestoreIndices(ip,&rip);
666: return(0);
667: }
669: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
670: {
671: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
672: IS isrow = b->row,iscol = b->col;
673: PetscBool row_identity,col_identity;
677: MatLUFactorNumeric_SeqAIJ(B,A,info);
678: /* determine which version of MatSolve needs to be used. */
679: ISIdentity(isrow,&row_identity);
680: ISIdentity(iscol,&col_identity);
681: if (row_identity && col_identity) {
682: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
683: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
684: B->ops->matsolve = NULL;
685: B->ops->matsolvetranspose = NULL;
686: } else {
687: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
688: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
689: B->ops->matsolve = NULL;
690: B->ops->matsolvetranspose = NULL;
691: }
693: /* get the triangular factors */
694: MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
695: return(0);
696: }
698: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
699: {
700: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
701: IS ip = b->row;
702: PetscBool perm_identity;
706: MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
708: /* determine which version of MatSolve needs to be used. */
709: ISIdentity(ip,&perm_identity);
710: if (perm_identity) {
711: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
712: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
713: B->ops->matsolve = NULL;
714: B->ops->matsolvetranspose = NULL;
715: } else {
716: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
717: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
718: B->ops->matsolve = NULL;
719: B->ops->matsolvetranspose = NULL;
720: }
722: /* get the triangular factors */
723: MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
724: return(0);
725: }
727: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
728: {
729: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
730: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
731: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
732: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
733: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
734: cusparseStatus_t stat;
735: cusparseIndexBase_t indexBase;
736: cusparseMatrixType_t matrixType;
737: cusparseFillMode_t fillMode;
738: cusparseDiagType_t diagType;
742: /*********************************************/
743: /* Now the Transpose of the Lower Tri Factor */
744: /*********************************************/
746: /* allocate space for the transpose of the lower triangular factor */
747: loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
749: /* set the matrix descriptors of the lower triangular factor */
750: matrixType = cusparseGetMatType(loTriFactor->descr);
751: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
752: fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
753: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
754: diagType = cusparseGetMatDiagType(loTriFactor->descr);
756: /* Create the matrix description */
757: stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
758: stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
759: stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
760: stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
761: stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);
763: /* Create the solve analysis information */
764: stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);
766: /* set the operation */
767: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
769: /* allocate GPU space for the CSC of the lower triangular factor*/
770: loTriFactorT->csrMat = new CsrMatrix;
771: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
772: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
773: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
774: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
775: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
776: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
778: /* compute the transpose of the lower triangular factor, i.e. the CSC */
779: stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
780: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
781: loTriFactor->csrMat->values->data().get(),
782: loTriFactor->csrMat->row_offsets->data().get(),
783: loTriFactor->csrMat->column_indices->data().get(),
784: loTriFactorT->csrMat->values->data().get(),
785: loTriFactorT->csrMat->column_indices->data().get(),
786: loTriFactorT->csrMat->row_offsets->data().get(),
787: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
789: /* perform the solve analysis on the transposed matrix */
790: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
791: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
792: loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
793: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
794: loTriFactorT->solveInfo);CHKERRCUDA(stat);
796: /* assign the pointer. Is this really necessary? */
797: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
799: /*********************************************/
800: /* Now the Transpose of the Upper Tri Factor */
801: /*********************************************/
803: /* allocate space for the transpose of the upper triangular factor */
804: upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
806: /* set the matrix descriptors of the upper triangular factor */
807: matrixType = cusparseGetMatType(upTriFactor->descr);
808: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
809: fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
810: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
811: diagType = cusparseGetMatDiagType(upTriFactor->descr);
813: /* Create the matrix description */
814: stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
815: stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
816: stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
817: stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
818: stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);
820: /* Create the solve analysis information */
821: stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);
823: /* set the operation */
824: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
826: /* allocate GPU space for the CSC of the upper triangular factor*/
827: upTriFactorT->csrMat = new CsrMatrix;
828: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
829: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
830: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
831: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
832: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
833: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
835: /* compute the transpose of the upper triangular factor, i.e. the CSC */
836: stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
837: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
838: upTriFactor->csrMat->values->data().get(),
839: upTriFactor->csrMat->row_offsets->data().get(),
840: upTriFactor->csrMat->column_indices->data().get(),
841: upTriFactorT->csrMat->values->data().get(),
842: upTriFactorT->csrMat->column_indices->data().get(),
843: upTriFactorT->csrMat->row_offsets->data().get(),
844: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
846: /* perform the solve analysis on the transposed matrix */
847: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
848: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
849: upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
850: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
851: upTriFactorT->solveInfo);CHKERRCUDA(stat);
853: /* assign the pointer. Is this really necessary? */
854: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
855: return(0);
856: }
858: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
859: {
860: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
861: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
862: Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
863: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
864: cusparseStatus_t stat;
865: cusparseIndexBase_t indexBase;
866: cudaError_t err;
867: PetscErrorCode ierr;
871: /* allocate space for the triangular factor information */
872: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
873: stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
874: indexBase = cusparseGetMatIndexBase(matstruct->descr);
875: stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
876: stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
878: /* set alpha and beta */
879: err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
880: err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
881: err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
882: err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
883: err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
884: err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
885: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
887: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
888: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
889: CsrMatrix *matrixT= new CsrMatrix;
890: matrixT->num_rows = A->cmap->n;
891: matrixT->num_cols = A->rmap->n;
892: matrixT->num_entries = a->nz;
893: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
894: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
895: matrixT->values = new THRUSTARRAY(a->nz);
897: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
898: cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
899: /* compute the transpose, i.e. the CSC */
900: indexBase = cusparseGetMatIndexBase(matstruct->descr);
901: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
902: A->cmap->n, matrix->num_entries,
903: matrix->values->data().get(),
904: cusparsestruct->rowoffsets_gpu->data().get(),
905: matrix->column_indices->data().get(),
906: matrixT->values->data().get(),
907: matrixT->column_indices->data().get(),
908: matrixT->row_offsets->data().get(),
909: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
910: /* assign the pointer */
911: matstructT->mat = matrixT;
912: PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));
913: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
914: /* First convert HYB to CSR */
915: CsrMatrix *temp= new CsrMatrix;
916: temp->num_rows = A->rmap->n;
917: temp->num_cols = A->cmap->n;
918: temp->num_entries = a->nz;
919: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
920: temp->column_indices = new THRUSTINTARRAY32(a->nz);
921: temp->values = new THRUSTARRAY(a->nz);
924: stat = cusparse_hyb2csr(cusparsestruct->handle,
925: matstruct->descr, (cusparseHybMat_t)matstruct->mat,
926: temp->values->data().get(),
927: temp->row_offsets->data().get(),
928: temp->column_indices->data().get());CHKERRCUDA(stat);
930: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
931: CsrMatrix *tempT= new CsrMatrix;
932: tempT->num_rows = A->rmap->n;
933: tempT->num_cols = A->cmap->n;
934: tempT->num_entries = a->nz;
935: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
936: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
937: tempT->values = new THRUSTARRAY(a->nz);
939: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
940: temp->num_cols, temp->num_entries,
941: temp->values->data().get(),
942: temp->row_offsets->data().get(),
943: temp->column_indices->data().get(),
944: tempT->values->data().get(),
945: tempT->column_indices->data().get(),
946: tempT->row_offsets->data().get(),
947: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
949: /* Last, convert CSC to HYB */
950: cusparseHybMat_t hybMat;
951: stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
952: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
953: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
954: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
955: matstructT->descr, tempT->values->data().get(),
956: tempT->row_offsets->data().get(),
957: tempT->column_indices->data().get(),
958: hybMat, 0, partition);CHKERRCUDA(stat);
960: /* assign the pointer */
961: matstructT->mat = hybMat;
962: PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));
964: /* delete temporaries */
965: if (tempT) {
966: if (tempT->values) delete (THRUSTARRAY*) tempT->values;
967: if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
968: if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
969: delete (CsrMatrix*) tempT;
970: }
971: if (temp) {
972: if (temp->values) delete (THRUSTARRAY*) temp->values;
973: if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
974: if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
975: delete (CsrMatrix*) temp;
976: }
977: }
978: /* assign the compressed row indices */
979: matstructT->cprowIndices = new THRUSTINTARRAY;
980: matstructT->cprowIndices->resize(A->cmap->n);
981: thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
982: /* assign the pointer */
983: ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
984: return(0);
985: }
987: /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
988: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
989: {
990: PetscInt n = xx->map->n;
991: const PetscScalar *barray;
992: PetscScalar *xarray;
993: thrust::device_ptr<const PetscScalar> bGPU;
994: thrust::device_ptr<PetscScalar> xGPU;
995: cusparseStatus_t stat;
996: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
997: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
998: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
999: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1000: PetscErrorCode ierr;
1003: /* Analyze the matrix and create the transpose ... on the fly */
1004: if (!loTriFactorT && !upTriFactorT) {
1005: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1006: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1007: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1008: }
1010: /* Get the GPU pointers */
1011: VecCUDAGetArrayWrite(xx,&xarray);
1012: VecCUDAGetArrayRead(bb,&barray);
1013: xGPU = thrust::device_pointer_cast(xarray);
1014: bGPU = thrust::device_pointer_cast(barray);
1016: PetscLogGpuTimeBegin();
1017: /* First, reorder with the row permutation */
1018: thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1019: thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1020: xGPU);
1022: /* First, solve U */
1023: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1024: upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1025: upTriFactorT->csrMat->values->data().get(),
1026: upTriFactorT->csrMat->row_offsets->data().get(),
1027: upTriFactorT->csrMat->column_indices->data().get(),
1028: upTriFactorT->solveInfo,
1029: xarray, tempGPU->data().get());CHKERRCUDA(stat);
1031: /* Then, solve L */
1032: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1033: loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1034: loTriFactorT->csrMat->values->data().get(),
1035: loTriFactorT->csrMat->row_offsets->data().get(),
1036: loTriFactorT->csrMat->column_indices->data().get(),
1037: loTriFactorT->solveInfo,
1038: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1040: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1041: thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1042: thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1043: tempGPU->begin());
1045: /* Copy the temporary to the full solution. */
1046: thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1048: /* restore */
1049: VecCUDARestoreArrayRead(bb,&barray);
1050: VecCUDARestoreArrayWrite(xx,&xarray);
1051: WaitForGPU();CHKERRCUDA(ierr);
1052: PetscLogGpuTimeEnd();
1053: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1054: return(0);
1055: }
1057: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1058: {
1059: const PetscScalar *barray;
1060: PetscScalar *xarray;
1061: cusparseStatus_t stat;
1062: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1063: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1064: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1065: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1066: PetscErrorCode ierr;
1069: /* Analyze the matrix and create the transpose ... on the fly */
1070: if (!loTriFactorT && !upTriFactorT) {
1071: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1072: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1073: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1074: }
1076: /* Get the GPU pointers */
1077: VecCUDAGetArrayWrite(xx,&xarray);
1078: VecCUDAGetArrayRead(bb,&barray);
1080: PetscLogGpuTimeBegin();
1081: /* First, solve U */
1082: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1083: upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1084: upTriFactorT->csrMat->values->data().get(),
1085: upTriFactorT->csrMat->row_offsets->data().get(),
1086: upTriFactorT->csrMat->column_indices->data().get(),
1087: upTriFactorT->solveInfo,
1088: barray, tempGPU->data().get());CHKERRCUDA(stat);
1090: /* Then, solve L */
1091: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1092: loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1093: loTriFactorT->csrMat->values->data().get(),
1094: loTriFactorT->csrMat->row_offsets->data().get(),
1095: loTriFactorT->csrMat->column_indices->data().get(),
1096: loTriFactorT->solveInfo,
1097: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1099: /* restore */
1100: VecCUDARestoreArrayRead(bb,&barray);
1101: VecCUDARestoreArrayWrite(xx,&xarray);
1102: WaitForGPU();CHKERRCUDA(ierr);
1103: PetscLogGpuTimeEnd();
1104: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1105: return(0);
1106: }
1108: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1109: {
1110: const PetscScalar *barray;
1111: PetscScalar *xarray;
1112: thrust::device_ptr<const PetscScalar> bGPU;
1113: thrust::device_ptr<PetscScalar> xGPU;
1114: cusparseStatus_t stat;
1115: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1116: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1117: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1118: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1119: PetscErrorCode ierr;
1123: /* Get the GPU pointers */
1124: VecCUDAGetArrayWrite(xx,&xarray);
1125: VecCUDAGetArrayRead(bb,&barray);
1126: xGPU = thrust::device_pointer_cast(xarray);
1127: bGPU = thrust::device_pointer_cast(barray);
1129: PetscLogGpuTimeBegin();
1130: /* First, reorder with the row permutation */
1131: thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1132: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1133: tempGPU->begin());
1135: /* Next, solve L */
1136: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1137: loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1138: loTriFactor->csrMat->values->data().get(),
1139: loTriFactor->csrMat->row_offsets->data().get(),
1140: loTriFactor->csrMat->column_indices->data().get(),
1141: loTriFactor->solveInfo,
1142: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1144: /* Then, solve U */
1145: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1146: upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1147: upTriFactor->csrMat->values->data().get(),
1148: upTriFactor->csrMat->row_offsets->data().get(),
1149: upTriFactor->csrMat->column_indices->data().get(),
1150: upTriFactor->solveInfo,
1151: xarray, tempGPU->data().get());CHKERRCUDA(stat);
1153: /* Last, reorder with the column permutation */
1154: thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1155: thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1156: xGPU);
1158: VecCUDARestoreArrayRead(bb,&barray);
1159: VecCUDARestoreArrayWrite(xx,&xarray);
1160: WaitForGPU();CHKERRCUDA(ierr);
1161: PetscLogGpuTimeEnd();
1162: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1163: return(0);
1164: }
1166: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1167: {
1168: const PetscScalar *barray;
1169: PetscScalar *xarray;
1170: cusparseStatus_t stat;
1171: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1172: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1173: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1174: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1175: PetscErrorCode ierr;
1178: /* Get the GPU pointers */
1179: VecCUDAGetArrayWrite(xx,&xarray);
1180: VecCUDAGetArrayRead(bb,&barray);
1182: PetscLogGpuTimeBegin();
1183: /* First, solve L */
1184: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1185: loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1186: loTriFactor->csrMat->values->data().get(),
1187: loTriFactor->csrMat->row_offsets->data().get(),
1188: loTriFactor->csrMat->column_indices->data().get(),
1189: loTriFactor->solveInfo,
1190: barray, tempGPU->data().get());CHKERRCUDA(stat);
1192: /* Next, solve U */
1193: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1194: upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1195: upTriFactor->csrMat->values->data().get(),
1196: upTriFactor->csrMat->row_offsets->data().get(),
1197: upTriFactor->csrMat->column_indices->data().get(),
1198: upTriFactor->solveInfo,
1199: tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1201: VecCUDARestoreArrayRead(bb,&barray);
1202: VecCUDARestoreArrayWrite(xx,&xarray);
1203: WaitForGPU();CHKERRCUDA(ierr);
1204: PetscLogGpuTimeEnd();
1205: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1206: return(0);
1207: }
1209: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1210: {
1211: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1212: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1213: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1214: PetscInt m = A->rmap->n,*ii,*ridx;
1215: PetscErrorCode ierr;
1216: cusparseStatus_t stat;
1217: cudaError_t err;
1220: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1221: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1222: if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1223: /* Copy values only */
1224: CsrMatrix *mat,*matT;
1225: mat = (CsrMatrix*)cusparsestruct->mat->mat;
1226: mat->values->assign(a->a, a->a+a->nz);
1227: PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1229: /* Update matT when it was built before */
1230: if (cusparsestruct->matTranspose) {
1231: cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1232: matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1233: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1234: A->cmap->n, mat->num_entries,
1235: mat->values->data().get(),
1236: cusparsestruct->rowoffsets_gpu->data().get(),
1237: mat->column_indices->data().get(),
1238: matT->values->data().get(),
1239: matT->column_indices->data().get(),
1240: matT->row_offsets->data().get(),
1241: CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUDA(stat);
1242: }
1243: } else {
1244: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1245: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);
1246: delete cusparsestruct->workVector;
1247: delete cusparsestruct->rowoffsets_gpu;
1248: try {
1249: cusparsestruct->nonzerorow=0;
1250: for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1252: if (a->compressedrow.use) {
1253: m = a->compressedrow.nrows;
1254: ii = a->compressedrow.i;
1255: ridx = a->compressedrow.rindex;
1256: } else {
1257: /* Forcing compressed row on the GPU */
1258: int k=0;
1259: PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1260: PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1261: ii[0]=0;
1262: for (int j = 0; j<m; j++) {
1263: if ((a->i[j+1]-a->i[j])>0) {
1264: ii[k] = a->i[j];
1265: ridx[k]= j;
1266: k++;
1267: }
1268: }
1269: ii[cusparsestruct->nonzerorow] = a->nz;
1270: m = cusparsestruct->nonzerorow;
1271: }
1273: /* allocate space for the triangular factor information */
1274: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1275: stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1276: stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1277: stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1279: err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
1280: err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1281: err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1282: err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1283: err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1284: err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1285: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1287: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1288: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1289: /* set the matrix */
1290: CsrMatrix *matrix= new CsrMatrix;
1291: matrix->num_rows = m;
1292: matrix->num_cols = A->cmap->n;
1293: matrix->num_entries = a->nz;
1294: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1295: matrix->row_offsets->assign(ii, ii + m+1);
1297: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1298: matrix->column_indices->assign(a->j, a->j+a->nz);
1300: matrix->values = new THRUSTARRAY(a->nz);
1301: matrix->values->assign(a->a, a->a+a->nz);
1303: /* assign the pointer */
1304: matstruct->mat = matrix;
1306: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1307: CsrMatrix *matrix= new CsrMatrix;
1308: matrix->num_rows = m;
1309: matrix->num_cols = A->cmap->n;
1310: matrix->num_entries = a->nz;
1311: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1312: matrix->row_offsets->assign(ii, ii + m+1);
1314: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1315: matrix->column_indices->assign(a->j, a->j+a->nz);
1317: matrix->values = new THRUSTARRAY(a->nz);
1318: matrix->values->assign(a->a, a->a+a->nz);
1320: cusparseHybMat_t hybMat;
1321: stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1322: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1323: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1324: stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1325: matstruct->descr, matrix->values->data().get(),
1326: matrix->row_offsets->data().get(),
1327: matrix->column_indices->data().get(),
1328: hybMat, 0, partition);CHKERRCUDA(stat);
1329: /* assign the pointer */
1330: matstruct->mat = hybMat;
1332: if (matrix) {
1333: if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1334: if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1335: if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1336: delete (CsrMatrix*)matrix;
1337: }
1338: }
1340: /* assign the compressed row indices */
1341: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1342: matstruct->cprowIndices->assign(ridx,ridx+m);
1343: PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));
1345: /* assign the pointer */
1346: cusparsestruct->mat = matstruct;
1348: if (!a->compressedrow.use) {
1349: PetscFree(ii);
1350: PetscFree(ridx);
1351: }
1352: cusparsestruct->workVector = new THRUSTARRAY(m);
1353: } catch(char *ex) {
1354: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1355: }
1356: cusparsestruct->nonzerostate = A->nonzerostate;
1357: }
1358: WaitForGPU();CHKERRCUDA(ierr);
1359: A->offloadmask = PETSC_OFFLOAD_BOTH;
1360: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1361: }
1362: return(0);
1363: }
1365: struct VecCUDAPlusEquals
1366: {
1367: template <typename Tuple>
1368: __host__ __device__
1369: void operator()(Tuple t)
1370: {
1371: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1372: }
1373: };
1375: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1376: {
1380: MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);
1381: return(0);
1382: }
1384: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1385: {
1386: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1387: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1388: Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1389: const PetscScalar *xarray;
1390: PetscScalar *yarray;
1391: PetscErrorCode ierr;
1392: cusparseStatus_t stat;
1395: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1396: MatSeqAIJCUSPARSECopyToGPU(A);
1397: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1398: if (!matstructT) {
1399: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1400: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1401: }
1402: VecCUDAGetArrayRead(xx,&xarray);
1403: VecCUDAGetArrayWrite(yy,&yarray);
1404: if (yy->map->n) {
1405: PetscInt n = yy->map->n;
1406: cudaError_t err;
1407: err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
1408: }
1410: PetscLogGpuTimeBegin();
1411: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1412: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1413: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1414: mat->num_rows, mat->num_cols,
1415: mat->num_entries, matstructT->alpha, matstructT->descr,
1416: mat->values->data().get(), mat->row_offsets->data().get(),
1417: mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1418: yarray);CHKERRCUDA(stat);
1419: } else {
1420: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1421: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1422: matstructT->alpha, matstructT->descr, hybMat,
1423: xarray, matstructT->beta_zero,
1424: yarray);CHKERRCUDA(stat);
1425: }
1426: VecCUDARestoreArrayRead(xx,&xarray);
1427: VecCUDARestoreArrayWrite(yy,&yarray);
1428: WaitForGPU();CHKERRCUDA(ierr);
1429: PetscLogGpuTimeEnd();
1430: PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1431: return(0);
1432: }
1435: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1436: {
1437: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1438: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1439: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1440: const PetscScalar *xarray;
1441: PetscScalar *zarray,*dptr,*beta;
1442: PetscErrorCode ierr;
1443: cusparseStatus_t stat;
1444: PetscBool cmpr; /* if the matrix has been compressed (zero rows) */
1447: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1448: MatSeqAIJCUSPARSECopyToGPU(A);
1449: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1450: try {
1451: cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
1452: VecCUDAGetArrayRead(xx,&xarray);
1453: if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
1454: VecCUDAGetArray(zz,&zarray);
1455: } else {
1456: VecCUDAGetArrayWrite(zz,&zarray);
1457: }
1458: dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
1459: beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;
1461: PetscLogGpuTimeBegin();
1462: /* csr_spmv is multiply add */
1463: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1464: /* here we need to be careful to set the number of rows in the multiply to the
1465: number of compressed rows in the matrix ... which is equivalent to the
1466: size of the workVector */
1467: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1468: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1469: mat->num_rows, mat->num_cols,
1470: mat->num_entries, matstruct->alpha, matstruct->descr,
1471: mat->values->data().get(), mat->row_offsets->data().get(),
1472: mat->column_indices->data().get(), xarray, beta,
1473: dptr);CHKERRCUDA(stat);
1474: } else {
1475: if (cusparsestruct->workVector->size()) {
1476: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1477: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1478: matstruct->alpha, matstruct->descr, hybMat,
1479: xarray, beta,
1480: dptr);CHKERRCUDA(stat);
1481: }
1482: }
1483: PetscLogGpuTimeEnd();
1485: if (yy) {
1486: if (dptr != zarray) {
1487: VecCopy_SeqCUDA(yy,zz);
1488: } else if (zz != yy) {
1489: VecAXPY_SeqCUDA(zz,1.0,yy);
1490: }
1491: } else if (dptr != zarray) {
1492: VecSet_SeqCUDA(zz,0);
1493: }
1494: /* scatter the data from the temporary into the full vector with a += operation */
1495: PetscLogGpuTimeBegin();
1496: if (dptr != zarray) {
1497: thrust::device_ptr<PetscScalar> zptr;
1499: zptr = thrust::device_pointer_cast(zarray);
1500: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1501: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1502: VecCUDAPlusEquals());
1503: }
1504: PetscLogGpuTimeEnd();
1505: VecCUDARestoreArrayRead(xx,&xarray);
1506: if (yy && !cmpr) {
1507: VecCUDARestoreArray(zz,&zarray);
1508: } else {
1509: VecCUDARestoreArrayWrite(zz,&zarray);
1510: }
1511: } catch(char *ex) {
1512: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1513: }
1514: WaitForGPU();CHKERRCUDA(ierr);
1515: PetscLogGpuFlops(2.0*a->nz);
1516: return(0);
1517: }
1519: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1520: {
1521: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1522: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1523: Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1524: const PetscScalar *xarray;
1525: PetscScalar *zarray,*beta;
1526: PetscErrorCode ierr;
1527: cusparseStatus_t stat;
1530: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1531: MatSeqAIJCUSPARSECopyToGPU(A);
1532: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1533: if (!matstructT) {
1534: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1535: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1536: }
1538: /* Note unlike Mat, MatTranspose uses non-compressed row storage */
1539: try {
1540: VecCopy_SeqCUDA(yy,zz);
1541: VecCUDAGetArrayRead(xx,&xarray);
1542: VecCUDAGetArray(zz,&zarray);
1543: beta = (yy == zz) ? matstructT->beta_one : matstructT->beta_zero;
1545: PetscLogGpuTimeBegin();
1546: /* multiply add with matrix transpose */
1547: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1548: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1549: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1550: mat->num_rows, mat->num_cols,
1551: mat->num_entries, matstructT->alpha, matstructT->descr,
1552: mat->values->data().get(), mat->row_offsets->data().get(),
1553: mat->column_indices->data().get(), xarray, beta,
1554: zarray);CHKERRCUDA(stat);
1555: } else {
1556: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1557: if (cusparsestruct->workVector->size()) {
1558: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1559: matstructT->alpha, matstructT->descr, hybMat,
1560: xarray, beta,
1561: zarray);CHKERRCUDA(stat);
1562: }
1563: }
1564: PetscLogGpuTimeEnd();
1566: if (zz != yy) {VecAXPY_SeqCUDA(zz,1.0,yy);}
1567: VecCUDARestoreArrayRead(xx,&xarray);
1568: VecCUDARestoreArray(zz,&zarray);
1569: } catch(char *ex) {
1570: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1571: }
1572: WaitForGPU();CHKERRCUDA(ierr);
1573: PetscLogGpuFlops(2.0*a->nz);
1574: return(0);
1575: }
1577: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1578: {
1582: MatAssemblyEnd_SeqAIJ(A,mode);
1583: if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1584: if (A->factortype == MAT_FACTOR_NONE) {
1585: MatSeqAIJCUSPARSECopyToGPU(A);
1586: }
1587: A->ops->mult = MatMult_SeqAIJCUSPARSE;
1588: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1589: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1590: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1591: return(0);
1592: }
1594: /* --------------------------------------------------------------------------------*/
1595: /*@
1596: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1597: (the default parallel PETSc format). This matrix will ultimately pushed down
1598: to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1599: assembly performance the user should preallocate the matrix storage by setting
1600: the parameter nz (or the array nnz). By setting these parameters accurately,
1601: performance during matrix assembly can be increased by more than a factor of 50.
1603: Collective
1605: Input Parameters:
1606: + comm - MPI communicator, set to PETSC_COMM_SELF
1607: . m - number of rows
1608: . n - number of columns
1609: . nz - number of nonzeros per row (same for all rows)
1610: - nnz - array containing the number of nonzeros in the various rows
1611: (possibly different for each row) or NULL
1613: Output Parameter:
1614: . A - the matrix
1616: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1617: MatXXXXSetPreallocation() paradgm instead of this routine directly.
1618: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1620: Notes:
1621: If nnz is given then nz is ignored
1623: The AIJ format (also called the Yale sparse matrix format or
1624: compressed row storage), is fully compatible with standard Fortran 77
1625: storage. That is, the stored row and column indices can begin at
1626: either one (as in Fortran) or zero. See the users' manual for details.
1628: Specify the preallocated storage with either nz or nnz (not both).
1629: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1630: allocation. For large problems you MUST preallocate memory or you
1631: will get TERRIBLE performance, see the users' manual chapter on matrices.
1633: By default, this format uses inodes (identical nodes) when possible, to
1634: improve numerical efficiency of matrix-vector products and solves. We
1635: search for consecutive rows with the same nonzero structure, thereby
1636: reusing matrix information to achieve increased efficiency.
1638: Level: intermediate
1640: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1641: @*/
1642: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1643: {
1647: MatCreate(comm,A);
1648: MatSetSizes(*A,m,n,m,n);
1649: MatSetType(*A,MATSEQAIJCUSPARSE);
1650: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1651: return(0);
1652: }
1654: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1655: {
1656: PetscErrorCode ierr;
1659: if (A->factortype==MAT_FACTOR_NONE) {
1660: if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
1661: MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1662: }
1663: } else {
1664: MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1665: }
1666: MatDestroy_SeqAIJ(A);
1667: return(0);
1668: }
1670: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1671: {
1673: Mat C;
1674: cusparseStatus_t stat;
1675: cusparseHandle_t handle=0;
1678: MatDuplicate_SeqAIJ(A,cpvalues,B);
1679: C = *B;
1680: PetscFree(C->defaultvectype);
1681: PetscStrallocpy(VECCUDA,&C->defaultvectype);
1683: /* inject CUSPARSE-specific stuff */
1684: if (C->factortype==MAT_FACTOR_NONE) {
1685: /* you cannot check the inode.use flag here since the matrix was just created.
1686: now build a GPU matrix data structure */
1687: C->spptr = new Mat_SeqAIJCUSPARSE;
1688: ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0;
1689: ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
1690: ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0;
1691: ((Mat_SeqAIJCUSPARSE*)C->spptr)->rowoffsets_gpu = 0;
1692: ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR;
1693: ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0;
1694: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1695: ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle;
1696: ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
1697: } else {
1698: /* NEXT, set the pointers to the triangular factors */
1699: C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1700: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0;
1701: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0;
1702: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1703: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1704: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0;
1705: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0;
1706: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0;
1707: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0;
1708: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1709: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle;
1710: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0;
1711: }
1713: C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
1714: C->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
1715: C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
1716: C->ops->mult = MatMult_SeqAIJCUSPARSE;
1717: C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1718: C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1719: C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1720: C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
1722: PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);
1724: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1726: PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1727: return(0);
1728: }
1730: PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
1731: {
1733: cusparseStatus_t stat;
1734: cusparseHandle_t handle=0;
1737: PetscFree(B->defaultvectype);
1738: PetscStrallocpy(VECCUDA,&B->defaultvectype);
1740: if (B->factortype==MAT_FACTOR_NONE) {
1741: /* you cannot check the inode.use flag here since the matrix was just created.
1742: now build a GPU matrix data structure */
1743: B->spptr = new Mat_SeqAIJCUSPARSE;
1744: ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0;
1745: ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1746: ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0;
1747: ((Mat_SeqAIJCUSPARSE*)B->spptr)->rowoffsets_gpu = 0;
1748: ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR;
1749: ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0;
1750: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1751: ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle;
1752: ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
1753: } else {
1754: /* NEXT, set the pointers to the triangular factors */
1755: B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1756: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
1757: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
1758: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1759: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1760: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0;
1761: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0;
1762: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0;
1763: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1764: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle;
1765: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0;
1766: }
1768: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
1769: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
1770: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
1771: B->ops->mult = MatMult_SeqAIJCUSPARSE;
1772: B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1773: B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1774: B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1775: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
1777: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
1779: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1781: PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1782: return(0);
1783: }
1785: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1786: {
1790: MatCreate_SeqAIJ(B);
1791: MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);
1792: return(0);
1793: }
1795: /*MC
1796: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1798: A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1799: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1800: All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1802: Options Database Keys:
1803: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1804: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1805: - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1807: Level: beginner
1809: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1810: M*/
1812: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1815: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1816: {
1820: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
1821: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
1822: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
1823: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
1824: return(0);
1825: }
1828: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1829: {
1830: cusparseStatus_t stat;
1831: cusparseHandle_t handle;
1834: if (*cusparsestruct) {
1835: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1836: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1837: delete (*cusparsestruct)->workVector;
1838: delete (*cusparsestruct)->rowoffsets_gpu;
1839: if (handle = (*cusparsestruct)->handle) {
1840: stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1841: }
1842: delete *cusparsestruct;
1843: *cusparsestruct = 0;
1844: }
1845: return(0);
1846: }
1848: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1849: {
1851: if (*mat) {
1852: delete (*mat)->values;
1853: delete (*mat)->column_indices;
1854: delete (*mat)->row_offsets;
1855: delete *mat;
1856: *mat = 0;
1857: }
1858: return(0);
1859: }
1861: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1862: {
1863: cusparseStatus_t stat;
1864: PetscErrorCode ierr;
1867: if (*trifactor) {
1868: if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1869: if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1870: CsrMatrix_Destroy(&(*trifactor)->csrMat);
1871: delete *trifactor;
1872: *trifactor = 0;
1873: }
1874: return(0);
1875: }
1877: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1878: {
1879: CsrMatrix *mat;
1880: cusparseStatus_t stat;
1881: cudaError_t err;
1884: if (*matstruct) {
1885: if ((*matstruct)->mat) {
1886: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1887: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1888: stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1889: } else {
1890: mat = (CsrMatrix*)(*matstruct)->mat;
1891: CsrMatrix_Destroy(&mat);
1892: }
1893: }
1894: if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1895: delete (*matstruct)->cprowIndices;
1896: if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1897: if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
1898: if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
1899: delete *matstruct;
1900: *matstruct = 0;
1901: }
1902: return(0);
1903: }
1905: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1906: {
1907: cusparseHandle_t handle;
1908: cusparseStatus_t stat;
1911: if (*trifactors) {
1912: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1913: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1914: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1915: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1916: delete (*trifactors)->rpermIndices;
1917: delete (*trifactors)->cpermIndices;
1918: delete (*trifactors)->workVector;
1919: if (handle = (*trifactors)->handle) {
1920: stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1921: }
1922: delete *trifactors;
1923: *trifactors = 0;
1924: }
1925: return(0);
1926: }