Actual source code: densecuda.cu
1: /*
2: Defines the matrix operations for sequential dense with CUDA
3: */
4: #include <petscpkg_version.h>
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6: #include <../src/mat/impls/dense/seq/dense.h>
7: #include <petsc/private/cudavecimpl.h>
9: #if defined(PETSC_USE_COMPLEX)
10: #if defined(PETSC_USE_REAL_SINGLE)
11: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnCpotrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
12: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnCpotrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
13: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnCpotrs((a),(b),(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g),(h),(i))
14: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnCpotri((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
15: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnCpotri_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
16: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnCsytrf((a),(b),(c),(cuComplex*)(d),(e),(f),(cuComplex*)(g),(h),(i))
17: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnCsytrf_bufferSize((a),(b),(cuComplex*)(c),(d),(e))
18: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnCgetrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
19: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnCgetrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
20: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnCgetrs((a),(b),(c),(d),(cuComplex*)(e),(f),(g),(cuComplex*)(h),(i),(j))
21: #define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnCgeqrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
22: #define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i) cusolverDnCgeqrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(cuComplex*)(g),(h),(i))
23: #define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnCunmqr_bufferSize((a),(b),(c),(d),(e),(f),(cuComplex*)(g),(h),(cuComplex*)(i),(cuComplex*)(j),(k),(l))
24: #define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnCunmqr((a),(b),(c),(d),(e),(f),(cuComplex*)(g),(h),(cuComplex*)(i),(cuComplex*)(j),(k),(cuComplex*)(l),(m),(n))
25: #define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l) cublasCtrsm((a),(b),(c),(d),(e),(f),(g),(cuComplex*)(h),(cuComplex*)(i),(j),(cuComplex*)(k),(l))
26: #else /* complex double */
27: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnZpotrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
28: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnZpotrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
29: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnZpotrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g),(h),(i))
30: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnZpotri((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
31: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnZpotri_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
32: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnZsytrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(f),(cuDoubleComplex*)(g),(h),(i))
33: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnZsytrf_bufferSize((a),(b),(cuDoubleComplex*)(c),(d),(e))
34: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnZgetrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
35: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnZgetrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
36: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnZgetrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(g),(cuDoubleComplex*)(h),(i),(j))
37: #define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnZgeqrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
38: #define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i) cusolverDnZgeqrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(cuDoubleComplex*)(g),(h),(i))
39: #define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnZunmqr_bufferSize((a),(b),(c),(d),(e),(f),(cuDoubleComplex*)(g),(h),(cuDoubleComplex*)(i),(cuDoubleComplex*)(j),(k),(l))
40: #define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnZunmqr((a),(b),(c),(d),(e),(f),(cuDoubleComplex*)(g),(h),(cuDoubleComplex*)(i),(cuDoubleComplex*)(j),(k),(cuDoubleComplex*)(l),(m),(n))
41: #define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l) cublasZtrsm((a),(b),(c),(d),(e),(f),(g),(cuDoubleComplex*)(h),(cuDoubleComplex*)(i),(j),(cuDoubleComplex*)(k),(l))
42: #endif
43: #else /* real */
44: #if defined(PETSC_USE_REAL_SINGLE)
45: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnSpotrf((a),(b),(c),(d),(e),(f),(g),(h))
46: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnSpotrf_bufferSize((a),(b),(c),(d),(e),(f))
47: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnSpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
48: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnSpotri((a),(b),(c),(d),(e),(f),(g),(h))
49: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnSpotri_bufferSize((a),(b),(c),(d),(e),(f))
50: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnSsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
51: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnSsytrf_bufferSize((a),(b),(c),(d),(e))
52: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnSgetrf((a),(b),(c),(d),(e),(f),(g),(h))
53: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnSgetrf_bufferSize((a),(b),(c),(d),(e),(f))
54: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnSgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
55: #define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnSgeqrf_bufferSize((a),(b),(c),(float*)(d),(e),(f))
56: #define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i) cusolverDnSgeqrf((a),(b),(c),(float*)(d),(e),(float*)(f),(float*)(g),(h),(i))
57: #define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnSormqr_bufferSize((a),(b),(c),(d),(e),(f),(float*)(g),(h),(float*)(i),(float*)(j),(k),(l))
58: #define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnSormqr((a),(b),(c),(d),(e),(f),(float*)(g),(h),(float*)(i),(float*)(j),(k),(float*)(l),(m),(n))
59: #define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l) cublasStrsm((a),(b),(c),(d),(e),(f),(g),(float*)(h),(float*)(i),(j),(float*)(k),(l))
60: #else /* real double */
61: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnDpotrf((a),(b),(c),(d),(e),(f),(g),(h))
62: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnDpotrf_bufferSize((a),(b),(c),(d),(e),(f))
63: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnDpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
64: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnDpotri((a),(b),(c),(d),(e),(f),(g),(h))
65: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnDpotri_bufferSize((a),(b),(c),(d),(e),(f))
66: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnDsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
67: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnDsytrf_bufferSize((a),(b),(c),(d),(e))
68: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnDgetrf((a),(b),(c),(d),(e),(f),(g),(h))
69: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnDgetrf_bufferSize((a),(b),(c),(d),(e),(f))
70: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnDgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
71: #define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnDgeqrf_bufferSize((a),(b),(c),(double*)(d),(e),(f))
72: #define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i) cusolverDnDgeqrf((a),(b),(c),(double*)(d),(e),(double*)(f),(double*)(g),(h),(i))
73: #define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnDormqr_bufferSize((a),(b),(c),(d),(e),(f),(double*)(g),(h),(double*)(i),(double*)(j),(k),(l))
74: #define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnDormqr((a),(b),(c),(d),(e),(f),(double*)(g),(h),(double*)(i),(double*)(j),(k),(double*)(l),(m),(n))
75: #define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l) cublasDtrsm((a),(b),(c),(d),(e),(f),(g),(double*)(h),(double*)(i),(j),(double*)(k),(l))
76: #endif
77: #endif
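/* Editor's note (not part of the original source): the cusolverDnX... and cublasX... macros above
   are thin dispatch wrappers that pick the cuSOLVER/cuBLAS entry point matching the PetscScalar
   type, casting to cuComplex or cuDoubleComplex where needed. As a sketch, in a real
   double-precision build

     cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,work,lwork,info)

   expands to the corresponding cusolverDnDpotrf call, while a complex single-precision build
   expands it to cusolverDnCpotrf with the PetscScalar pointers cast to cuComplex*. */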
79: typedef struct {
80: PetscScalar *d_v; /* pointer to the matrix on the GPU */
81: PetscBool user_alloc;
82: PetscScalar *unplacedarray; /* if one called MatCUDADensePlaceArray(), this is where it stashed the original */
83: PetscBool unplaced_user_alloc;
84: /* factorization support */
85: PetscCuBLASInt *d_fact_ipiv; /* device pivots */
86: PetscScalar *d_fact_tau; /* device QR tau vector */
87: PetscScalar *d_fact_work; /* device workspace */
88: PetscCuBLASInt fact_lwork;
89: PetscCuBLASInt *d_fact_info; /* device info */
90: /* workspace */
91: Vec workvec;
92: } Mat_SeqDenseCUDA;
94: PetscErrorCode MatSeqDenseCUDASetPreallocation(Mat A, PetscScalar *d_data)
95: {
96: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
97: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
98: PetscBool iscuda;
100: PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&iscuda);
101: if (!iscuda) return 0;
102: PetscLayoutSetUp(A->rmap);
103: PetscLayoutSetUp(A->cmap);
104: /* the CPU preallocation may not have been performed yet */
105: if (cA->lda <= 0) cA->lda = A->rmap->n;
106: if (!dA->user_alloc) cudaFree(dA->d_v);
107: if (!d_data) { /* petsc-allocated storage */
108: size_t sz;
110: PetscIntMultError(cA->lda,A->cmap->n,NULL);
111: sz = cA->lda*A->cmap->n*sizeof(PetscScalar);
112: cudaMalloc((void**)&dA->d_v,sz);
113: cudaMemset(dA->d_v,0,sz);
114: dA->user_alloc = PETSC_FALSE;
115: } else { /* user-allocated storage */
116: dA->d_v = d_data;
117: dA->user_alloc = PETSC_TRUE;
118: }
119: A->offloadmask = PETSC_OFFLOAD_GPU;
120: A->preallocated = PETSC_TRUE;
121: A->assembled = PETSC_TRUE;
122: return 0;
123: }
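/* Editor's note: a minimal usage sketch (an assumption for illustration, not additional source;
   it presumes a CUDA-enabled PETSc build and PetscInt m,n already set). Passing d_data = NULL
   makes PETSc allocate and zero lda*n device entries; passing a user pointer keeps ownership of
   the storage with the caller:

     Mat          A;
     PetscScalar *d_user;                                    // device memory owned by the caller
     cudaMalloc((void**)&d_user,(size_t)m*n*sizeof(PetscScalar));
     MatCreateSeqDenseCUDA(PETSC_COMM_SELF,m,n,d_user,&A);   // ends up in MatSeqDenseCUDASetPreallocation
     // ... use A ...
     MatDestroy(&A);
     cudaFree(d_user);                                       // user-owned storage is not freed by PETSc
*/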
125: PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
126: {
127: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
128: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
131: PetscInfo(A,"%s matrix %d x %d\n",A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
132: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
133: if (!cA->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
134: MatSeqDenseSetPreallocation(A,NULL);
135: }
136: PetscLogEventBegin(MAT_DenseCopyFromGPU,A,0,0,0);
137: if (cA->lda > A->rmap->n) {
138: cudaMemcpy2D(cA->v,cA->lda*sizeof(PetscScalar),dA->d_v,cA->lda*sizeof(PetscScalar),A->rmap->n*sizeof(PetscScalar),A->cmap->n,cudaMemcpyDeviceToHost);
139: } else {
140: cudaMemcpy(cA->v,dA->d_v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToHost);
141: }
142: PetscLogGpuToCpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
143: PetscLogEventEnd(MAT_DenseCopyFromGPU,A,0,0,0);
145: A->offloadmask = PETSC_OFFLOAD_BOTH;
146: }
147: return 0;
148: }
150: PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
151: {
152: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
153: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
154: PetscBool copy;
157: if (A->boundtocpu) return 0;
158: copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
159: PetscInfo(A,"%s matrix %d x %d\n",copy ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
160: if (copy) {
161: if (!dA->d_v) { /* Allocate GPU memory if not present */
162: MatSeqDenseCUDASetPreallocation(A,NULL);
163: }
164: PetscLogEventBegin(MAT_DenseCopyToGPU,A,0,0,0);
165: if (cA->lda > A->rmap->n) {
166: cudaMemcpy2D(dA->d_v,cA->lda*sizeof(PetscScalar),cA->v,cA->lda*sizeof(PetscScalar),A->rmap->n*sizeof(PetscScalar),A->cmap->n,cudaMemcpyHostToDevice);
167: } else {
168: cudaMemcpy(dA->d_v,cA->v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyHostToDevice);
169: }
170: PetscLogCpuToGpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
171: PetscLogEventEnd(MAT_DenseCopyToGPU,A,0,0,0);
173: A->offloadmask = PETSC_OFFLOAD_BOTH;
174: }
175: return 0;
176: }
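/* Editor's note: both copy routines above choose between a strided and a contiguous transfer.
   The matrix is stored column-major with leading dimension lda >= m; when lda > m each column
   carries lda-m padding entries, so cudaMemcpy2D moves only the m useful entries of each of the
   n columns, whereas for lda == m the whole block of lda*n scalars is contiguous and a single
   cudaMemcpy suffices. */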
178: static PetscErrorCode MatCopy_SeqDenseCUDA(Mat A,Mat B,MatStructure str)
179: {
180: const PetscScalar *va;
181: PetscScalar *vb;
182: PetscInt lda1,lda2,m=A->rmap->n,n=A->cmap->n;
184: /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
185: if (A->ops->copy != B->ops->copy) {
186: MatCopy_Basic(A,B,str);
187: return 0;
188: }
190: MatDenseCUDAGetArrayRead(A,&va);
191: MatDenseCUDAGetArrayWrite(B,&vb);
192: MatDenseGetLDA(A,&lda1);
193: MatDenseGetLDA(B,&lda2);
194: PetscLogGpuTimeBegin();
195: if (lda1>m || lda2>m) {
196: cudaMemcpy2D(vb,lda2*sizeof(PetscScalar),va,lda1*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);
197: } else {
198: cudaMemcpy(vb,va,m*(n*sizeof(PetscScalar)),cudaMemcpyDeviceToDevice);
199: }
200: PetscLogGpuTimeEnd();
201: MatDenseCUDARestoreArrayWrite(B,&vb);
202: MatDenseCUDARestoreArrayRead(A,&va);
203: return 0;
204: }
206: static PetscErrorCode MatZeroEntries_SeqDenseCUDA(Mat A)
207: {
208: PetscScalar *va;
209: PetscInt lda,m = A->rmap->n,n = A->cmap->n;
211: MatDenseCUDAGetArrayWrite(A,&va);
212: MatDenseGetLDA(A,&lda);
213: PetscLogGpuTimeBegin();
214: if (lda>m) {
215: cudaMemset2D(va,lda*sizeof(PetscScalar),0,m*sizeof(PetscScalar),n);
216: } else {
217: cudaMemset(va,0,m*(n*sizeof(PetscScalar)));
218: }
219: PetscLogGpuTimeEnd();
220: MatDenseCUDARestoreArrayWrite(A,&va);
221: return 0;
222: }
224: static PetscErrorCode MatDenseCUDAPlaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
225: {
226: Mat_SeqDense *aa = (Mat_SeqDense*)A->data;
227: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
232: if (aa->v) MatSeqDenseCUDACopyToGPU(A);
233: dA->unplacedarray = dA->d_v;
234: dA->unplaced_user_alloc = dA->user_alloc;
235: dA->d_v = (PetscScalar*)a;
236: dA->user_alloc = PETSC_TRUE;
237: return 0;
238: }
240: static PetscErrorCode MatDenseCUDAResetArray_SeqDenseCUDA(Mat A)
241: {
242: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
243: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
247: if (a->v) MatSeqDenseCUDACopyToGPU(A);
248: dA->d_v = dA->unplacedarray;
249: dA->user_alloc = dA->unplaced_user_alloc;
250: dA->unplacedarray = NULL;
251: return 0;
252: }
254: static PetscErrorCode MatDenseCUDAReplaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
255: {
256: Mat_SeqDense *aa = (Mat_SeqDense*)A->data;
257: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
262: if (!dA->user_alloc) cudaFree(dA->d_v);
263: dA->d_v = (PetscScalar*)a;
264: dA->user_alloc = PETSC_FALSE;
265: return 0;
266: }
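/* Editor's note: a sketch of how the place/reset pair above is meant to be used (assumes a
   device pointer d_tmp with at least lda*n valid entries):

     MatDenseCUDAPlaceArray(A,d_tmp);   // stash the current device array and use d_tmp instead
     // ... operate on A ...
     MatDenseCUDAResetArray(A);         // restore the stashed array; d_tmp remains owned by the caller

   MatDenseCUDAReplaceArray, by contrast, frees the currently PETSc-owned device storage (if any)
   and adopts the new pointer, which PETSc will free when the matrix is destroyed. */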
268: static PetscErrorCode MatDenseCUDAGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
269: {
270: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
272: if (!dA->d_v) {
273: MatSeqDenseCUDASetPreallocation(A,NULL);
274: }
275: *a = dA->d_v;
276: return 0;
277: }
279: static PetscErrorCode MatDenseCUDARestoreArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
280: {
281: if (a) *a = NULL;
282: return 0;
283: }
285: static PetscErrorCode MatDenseCUDAGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
286: {
287: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
289: MatSeqDenseCUDACopyToGPU(A);
290: *a = dA->d_v;
291: return 0;
292: }
294: static PetscErrorCode MatDenseCUDARestoreArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
295: {
296: if (a) *a = NULL;
297: return 0;
298: }
300: static PetscErrorCode MatDenseCUDAGetArray_SeqDenseCUDA(Mat A, PetscScalar **a)
301: {
302: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
304: MatSeqDenseCUDACopyToGPU(A);
305: *a = dA->d_v;
306: return 0;
307: }
309: static PetscErrorCode MatDenseCUDARestoreArray_SeqDenseCUDA(Mat A, PetscScalar **a)
310: {
311: if (a) *a = NULL;
312: return 0;
313: }
315: PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
316: {
317: #if PETSC_PKG_CUDA_VERSION_GE(10,1,0)
318: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
319: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
320: PetscScalar *da;
321: cusolverDnHandle_t handle;
322: PetscCuBLASInt n,lda;
323: #if defined(PETSC_USE_DEBUG)
324: PetscCuBLASInt info;
325: #endif
327: if (!A->rmap->n || !A->cmap->n) return 0;
328: PetscCUSOLVERDnGetHandle(&handle);
329: PetscCuBLASIntCast(A->cmap->n,&n);
330: PetscCuBLASIntCast(a->lda,&lda);
332: if (A->factortype == MAT_FACTOR_CHOLESKY) {
333: if (!dA->d_fact_ipiv) { /* spd */
334: PetscCuBLASInt il;
336: MatDenseCUDAGetArray(A,&da);
337: cusolverDnXpotri_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&il);
338: if (il > dA->fact_lwork) {
339: dA->fact_lwork = il;
341: cudaFree(dA->d_fact_work);
342: cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));
343: }
344: PetscLogGpuTimeBegin();
345: cusolverDnXpotri(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);
346: PetscLogGpuTimeEnd();
347: MatDenseCUDARestoreArray(A,&da);
348: /* TODO (write cuda kernel) */
349: MatSeqDenseSymmetrize_Private(A,PETSC_TRUE);
350: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytri not implemented");
351: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Not implemented");
352: #if defined(PETSC_USE_DEBUG)
353: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
356: #endif
357: PetscLogGpuFlops(1.0*n*n*n/3.0);
358: A->ops->solve = NULL;
359: A->ops->solvetranspose = NULL;
360: A->ops->matsolve = NULL;
361: A->factortype = MAT_FACTOR_NONE;
363: PetscFree(A->solvertype);
364: return 0;
365: #else
366: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Upgrade to CUDA version 10.1.0 or higher");
367: #endif
368: }
370: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal(Mat A, Vec xx, Vec yy, PetscBool transpose,
371: PetscErrorCode (*matsolve)(Mat,PetscScalar*,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscBool))
372: {
373: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
374: PetscScalar *y;
375: PetscCuBLASInt m=0, k=0;
376: PetscBool xiscuda, yiscuda, aiscuda;
379: PetscCuBLASIntCast(A->rmap->n,&m);
380: PetscCuBLASIntCast(A->cmap->n,&k);
381: PetscObjectTypeCompare((PetscObject)xx,VECSEQCUDA,&xiscuda);
382: PetscObjectTypeCompare((PetscObject)yy,VECSEQCUDA,&yiscuda);
383: {
384: const PetscScalar *x;
385: PetscBool xishost = PETSC_TRUE;
387:      /* The logic here is to minimize the amount of memory copying:
388:         if we called VecCUDAGetArrayRead(xx,&x) unconditionally, then whenever xiscuda
389:         holds but the data has not yet been offloaded to the GPU, the data would first
390:         be copied to the GPU. But we only need the data in order to copy it into the y
391:         array, so we read x from wherever the data already resides and thereby perform
392:         only one memcpy */
393: if (xiscuda && xx->offloadmask & PETSC_OFFLOAD_GPU) {
394: VecCUDAGetArrayRead(xx, &x);
395: xishost = PETSC_FALSE;
396: } else {
397: VecGetArrayRead(xx, &x);
398: }
399: if (k < m || !yiscuda) {
400: if (!dA->workvec) {
401: VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
402: }
403: VecCUDAGetArrayWrite(dA->workvec, &y);
404: } else {
405: VecCUDAGetArrayWrite(yy,&y);
406: }
407: cudaMemcpy(y,x,m*sizeof(PetscScalar),xishost ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice);
408: }
409: PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&aiscuda);
410: if (!aiscuda) {
411: MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
412: }
413: (*matsolve) (A, y, m, m, 1, k, transpose);
414: if (!aiscuda) {
415: MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
416: }
417: if (k < m || !yiscuda) {
418: PetscScalar *yv;
420: /* The logic here is that the data is not yet in either yy's GPU array or its
421: CPU array. There is nothing in the interface to say where the user would like
422: it to end up. So we choose the GPU, because it is the faster option */
423: if (yiscuda) {
424: VecCUDAGetArrayWrite(yy,&yv);
425: } else {
426: VecGetArray(yy,&yv);
427: }
428: cudaMemcpy(yv,y,k*sizeof(PetscScalar),yiscuda ? cudaMemcpyDeviceToDevice: cudaMemcpyDeviceToHost);
429: if (yiscuda) {
430: VecCUDARestoreArrayWrite(yy,&yv);
431: } else {
432: VecRestoreArray(yy,&yv);
433: }
434: VecCUDARestoreArrayWrite(dA->workvec, &y);
435: } else {
436: VecCUDARestoreArrayWrite(yy,&y);
437: }
438: return 0;
439: }
441: static PetscErrorCode MatMatSolve_SeqDenseCUDA_Internal(Mat A, Mat B, Mat X, PetscBool transpose,
442: PetscErrorCode (*matsolve)(Mat,PetscScalar*,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscBool))
443: {
444: PetscScalar *y;
445: PetscInt n, _ldb, _ldx;
446: PetscBool biscuda, xiscuda, aiscuda;
447: PetscCuBLASInt nrhs=0,m=0,k=0,ldb=0,ldx=0,ldy=0;
450: PetscCuBLASIntCast(A->rmap->n,&m);
451: PetscCuBLASIntCast(A->cmap->n,&k);
452: MatGetSize(B,NULL,&n);
453: PetscCuBLASIntCast(n,&nrhs);
454: MatDenseGetLDA(B,&_ldb);
455: PetscCuBLASIntCast(_ldb, &ldb);
456: MatDenseGetLDA(X,&_ldx);
457: PetscCuBLASIntCast(_ldx, &ldx);
459: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
460: PetscObjectTypeCompare((PetscObject)X,MATSEQDENSECUDA,&xiscuda);
461: {
462:    /* The logic here is to minimize the amount of memory copying:
463:       if we called MatDenseCUDAGetArrayRead(B,&b) unconditionally, then whenever biscuda
464:       holds but the data has not yet been offloaded to the GPU, the data would first
465:       be copied to the GPU. But we only need the data in order to copy it into the y
466:       array, so we read b from wherever the data already resides and thereby perform
467:       only one memcpy */
468: const PetscScalar *b;
470: /* some copying from B will be involved */
471: PetscBool bishost = PETSC_TRUE;
473: if (biscuda && B->offloadmask & PETSC_OFFLOAD_GPU) {
474: MatDenseCUDAGetArrayRead(B,&b);
475: bishost = PETSC_FALSE;
476: } else {
477: MatDenseGetArrayRead(B,&b);
478: }
479: if (ldx < m || !xiscuda) {
480: /* X's array cannot serve as the array (too small or not on device), B's
481: * array cannot serve as the array (const), so allocate a new array */
482: ldy = m;
483: cudaMalloc((void**)&y,nrhs*m*sizeof(PetscScalar));
484: } else {
485: /* X's array should serve as the array */
486: ldy = ldx;
487: MatDenseCUDAGetArrayWrite(X,&y);
488: }
489: cudaMemcpy2D(y,ldy*sizeof(PetscScalar),b,ldb*sizeof(PetscScalar),m*sizeof(PetscScalar),nrhs,bishost ? cudaMemcpyHostToDevice: cudaMemcpyDeviceToDevice);
490: if (bishost) {
491: MatDenseRestoreArrayRead(B,&b);
492: } else {
493: MatDenseCUDARestoreArrayRead(B,&b);
494: }
495: }
496: PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&aiscuda);
497: if (!aiscuda) {
498: MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
499: }
500: (*matsolve) (A, y, ldy, m, nrhs, k, transpose);
501: if (!aiscuda) {
502:     MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
503: }
504: if (ldx < m || !xiscuda) {
505: PetscScalar *x;
507: /* The logic here is that the data is not yet in either X's GPU array or its
508: CPU array. There is nothing in the interface to say where the user would like
509: it to end up. So we choose the GPU, because it is the faster option */
510: if (xiscuda) {
511: MatDenseCUDAGetArrayWrite(X,&x);
512: } else {
513: MatDenseGetArray(X,&x);
514: }
515: cudaMemcpy2D(x,ldx*sizeof(PetscScalar),y,ldy*sizeof(PetscScalar),k*sizeof(PetscScalar),nrhs,xiscuda ? cudaMemcpyDeviceToDevice: cudaMemcpyDeviceToHost);
516: if (xiscuda) {
517: MatDenseCUDARestoreArrayWrite(X,&x);
518: } else {
519: MatDenseRestoreArray(X,&x);
520: }
521: cudaFree(y);
522: } else {
523: MatDenseCUDARestoreArrayWrite(X,&y);
524: }
525: return 0;
526: }
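/* Editor's note: the two _Internal drivers above share one pattern: they stage the right-hand
   side into a device buffer with at most one memcpy (reading it from wherever it already lives),
   temporarily convert A to MATSEQDENSECUDA if needed, delegate the factored solve to the callback
   (the LU, Cholesky, or QR kernel below), and finally copy the result to wherever the caller's
   output object keeps its data, preferring the GPU when the destination is ambiguous. */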
528: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_LU(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
529: {
530: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
531: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
532: const PetscScalar *da;
533: PetscCuBLASInt lda;
534: cusolverDnHandle_t handle;
535: int info;
537: MatDenseCUDAGetArrayRead(A,&da);
538: PetscCuBLASIntCast(mat->lda,&lda);
539: PetscCUSOLVERDnGetHandle(&handle);
540: PetscLogGpuTimeBegin();
541: PetscInfo(A,"LU solve %d x %d on backend\n",m,k);
542: cusolverDnXgetrs(handle,T ? CUBLAS_OP_T : CUBLAS_OP_N,m,nrhs,da,lda,dA->d_fact_ipiv,x,ldx,dA->d_fact_info);
543: PetscLogGpuTimeEnd();
544: MatDenseCUDARestoreArrayRead(A,&da);
545: if (PetscDefined(USE_DEBUG)) {
546: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
549: }
550: PetscLogGpuFlops(nrhs*(2.0*m*m - m));
551: return 0;
552: }
554: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_Cholesky(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
555: {
556: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
557: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
558: const PetscScalar *da;
559: PetscCuBLASInt lda;
560: cusolverDnHandle_t handle;
561: int info;
563: MatDenseCUDAGetArrayRead(A,&da);
564: PetscCuBLASIntCast(mat->lda,&lda);
565: PetscCUSOLVERDnGetHandle(&handle);
566: PetscLogGpuTimeBegin();
567: PetscInfo(A,"Cholesky solve %d x %d on backend\n",m,k);
568: if (!dA->d_fact_ipiv) { /* spd */
569: /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
570: cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,m,nrhs,da,lda,x,ldx,dA->d_fact_info);
571: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
572: PetscLogGpuTimeEnd();
573: MatDenseCUDARestoreArrayRead(A,&da);
574: if (PetscDefined(USE_DEBUG)) {
575: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
578: }
579: PetscLogGpuFlops(nrhs*(2.0*m*m - m));
580: return 0;
581: }
583: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_QR(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
584: {
585: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
586: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
587: const PetscScalar *da;
588: PetscCuBLASInt lda, rank;
589: cusolverDnHandle_t handle;
590: cublasHandle_t bhandle;
591: int info;
592: cublasOperation_t trans;
593: PetscScalar one = 1.;
595: PetscCuBLASIntCast(mat->rank,&rank);
596: MatDenseCUDAGetArrayRead(A,&da);
597: PetscCuBLASIntCast(mat->lda,&lda);
598: PetscCUSOLVERDnGetHandle(&handle);
599: PetscCUBLASGetHandle(&bhandle);
600: PetscLogGpuTimeBegin();
601: PetscInfo(A,"QR solve %d x %d on backend\n",m,k);
602: if (!T) {
603: if (PetscDefined(USE_COMPLEX)) {
604: trans = CUBLAS_OP_C;
605: } else {
606: trans = CUBLAS_OP_T;
607: }
608: cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, trans, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
609: if (PetscDefined(USE_DEBUG)) {
610: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
612: }
613: cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);
614: } else {
615: cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);
616: cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
617: if (PetscDefined(USE_DEBUG)) {
618: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
620: }
621: }
622: PetscLogGpuTimeEnd();
623: MatDenseCUDARestoreArrayRead(A,&da);
624: PetscLogFlops(nrhs*(4.0*m*mat->rank - PetscSqr(mat->rank)));
625: return 0;
626: }
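/* Editor's note: in the QR branch above, A has been factored as A = Q*R with rank = min(m,n)
   Householder reflectors (see MatQRFactor_SeqDenseCUDA below). In the real case the
   non-transposed least-squares solve computes x = R^{-1} (Q^T b) via cusolverDnXormqr followed
   by cublasXtrsm; the transposed variant applies R^{-T} first (trsm with CUBLAS_OP_T) and then
   multiplies by Q (ormqr with CUBLAS_OP_N), i.e. the minimum-norm solve of the transposed
   system. */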
628: static PetscErrorCode MatSolve_SeqDenseCUDA_LU(Mat A,Vec xx,Vec yy)
629: {
630: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU);
631: return 0;
632: }
634: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_LU(Mat A,Vec xx,Vec yy)
635: {
636: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU);
637: return 0;
638: }
640: static PetscErrorCode MatSolve_SeqDenseCUDA_Cholesky(Mat A,Vec xx,Vec yy)
641: {
642: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
643: return 0;
644: }
646: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A,Vec xx,Vec yy)
647: {
648: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
649: return 0;
650: }
652: static PetscErrorCode MatSolve_SeqDenseCUDA_QR(Mat A,Vec xx,Vec yy)
653: {
654: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR);
655: return 0;
656: }
658: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_QR(Mat A,Vec xx,Vec yy)
659: {
660: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR);
661: return 0;
662: }
664: static PetscErrorCode MatMatSolve_SeqDenseCUDA_LU(Mat A,Mat B,Mat X)
665: {
666: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU);
667: return 0;
668: }
670: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_LU(Mat A,Mat B,Mat X)
671: {
672: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU);
673: return 0;
674: }
676: static PetscErrorCode MatMatSolve_SeqDenseCUDA_Cholesky(Mat A,Mat B,Mat X)
677: {
678: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
679: return 0;
680: }
682: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A,Mat B,Mat X)
683: {
684: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
685: return 0;
686: }
688: static PetscErrorCode MatMatSolve_SeqDenseCUDA_QR(Mat A,Mat B,Mat X)
689: {
690: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR);
691: return 0;
692: }
694: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_QR(Mat A,Mat B,Mat X)
695: {
696: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR);
697: return 0;
698: }
700: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A,IS rperm,IS cperm,const MatFactorInfo *factinfo)
701: {
702: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
703: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
704: PetscScalar *da;
705: PetscCuBLASInt m,n,lda;
706: #if defined(PETSC_USE_DEBUG)
707: int info;
708: #endif
709: cusolverDnHandle_t handle;
711: if (!A->rmap->n || !A->cmap->n) return 0;
712: PetscCUSOLVERDnGetHandle(&handle);
713: MatDenseCUDAGetArray(A,&da);
714: PetscCuBLASIntCast(A->cmap->n,&n);
715: PetscCuBLASIntCast(A->rmap->n,&m);
716: PetscCuBLASIntCast(a->lda,&lda);
717: PetscInfo(A,"LU factor %d x %d on backend\n",m,n);
718: if (!dA->d_fact_ipiv) {
719: cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));
720: }
721: if (!dA->fact_lwork) {
722: cusolverDnXgetrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);
723: cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));
724: }
725: if (!dA->d_fact_info) {
726: cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));
727: }
728: PetscLogGpuTimeBegin();
729: cusolverDnXgetrf(handle,m,n,da,lda,dA->d_fact_work,dA->d_fact_ipiv,dA->d_fact_info);
730: PetscLogGpuTimeEnd();
731: MatDenseCUDARestoreArray(A,&da);
732: #if defined(PETSC_USE_DEBUG)
733: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
736: #endif
737: A->factortype = MAT_FACTOR_LU;
738: PetscLogGpuFlops(2.0*n*n*m/3.0);
740: A->ops->solve = MatSolve_SeqDenseCUDA_LU;
741: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA_LU;
742: A->ops->matsolve = MatMatSolve_SeqDenseCUDA_LU;
743: A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_LU;
745: PetscFree(A->solvertype);
746: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
747: return 0;
748: }
750: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A,IS perm,const MatFactorInfo *factinfo)
751: {
752: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
753: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
754: PetscScalar *da;
755: PetscCuBLASInt n,lda;
756: #if defined(PETSC_USE_DEBUG)
757: int info;
758: #endif
759: cusolverDnHandle_t handle;
761: if (!A->rmap->n || !A->cmap->n) return 0;
762: PetscCUSOLVERDnGetHandle(&handle);
763: PetscCuBLASIntCast(A->rmap->n,&n);
764: PetscInfo(A,"Cholesky factor %d x %d on backend\n",n,n);
765: if (A->spd) {
766: MatDenseCUDAGetArray(A,&da);
767: PetscCuBLASIntCast(a->lda,&lda);
768: if (!dA->fact_lwork) {
769: cusolverDnXpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&dA->fact_lwork);
770: cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));
771: }
772: if (!dA->d_fact_info) {
773: cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));
774: }
775: PetscLogGpuTimeBegin();
776: cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);
777: PetscLogGpuTimeEnd();
779: MatDenseCUDARestoreArray(A,&da);
780: #if defined(PETSC_USE_DEBUG)
781: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
784: #endif
785: A->factortype = MAT_FACTOR_CHOLESKY;
786: PetscLogGpuFlops(1.0*n*n*n/3.0);
787: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
788: #if 0
789: /* at the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs and *hetr* routines
790: The code below should work, and it can be activated when *sytrs routines will be available */
791: if (!dA->d_fact_ipiv) {
792: cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));
793: }
794: if (!dA->fact_lwork) {
795: cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);
796: cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));
797: }
798: if (!dA->d_fact_info) {
799: cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));
800: }
801: PetscLogGpuTimeBegin();
802: cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);
803: PetscLogGpuTimeEnd();
804: #endif
806: A->ops->solve = MatSolve_SeqDenseCUDA_Cholesky;
807: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA_Cholesky;
808: A->ops->matsolve = MatMatSolve_SeqDenseCUDA_Cholesky;
809: A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_Cholesky;
810: PetscFree(A->solvertype);
811: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
812: return 0;
813: }
815: static PetscErrorCode MatQRFactor_SeqDenseCUDA(Mat A,IS col,const MatFactorInfo *factinfo)
816: {
817: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
818: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
819: PetscScalar *da;
820: PetscCuBLASInt m,min,max,n,lda;
821: #if defined(PETSC_USE_DEBUG)
822: int info;
823: #endif
824: cusolverDnHandle_t handle;
826: if (!A->rmap->n || !A->cmap->n) return 0;
827: PetscCUSOLVERDnGetHandle(&handle);
828: MatDenseCUDAGetArray(A,&da);
829: PetscCuBLASIntCast(A->cmap->n,&n);
830: PetscCuBLASIntCast(A->rmap->n,&m);
831: PetscCuBLASIntCast(a->lda,&lda);
832: PetscInfo(A,"QR factor %d x %d on backend\n",m,n);
833: max = PetscMax(m,n);
834: min = PetscMin(m,n);
835: if (!dA->d_fact_tau) cudaMalloc((void**)&dA->d_fact_tau,min*sizeof(*dA->d_fact_tau));
836: if (!dA->d_fact_ipiv) cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));
837: if (!dA->fact_lwork) {
838: cusolverDnXgeqrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);
839: cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));
840: }
841: if (!dA->d_fact_info) cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));
842: if (!dA->workvec) VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
843: PetscLogGpuTimeBegin();
844: cusolverDnXgeqrf(handle,m,n,da,lda,dA->d_fact_tau,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);
845: PetscLogGpuTimeEnd();
846: MatDenseCUDARestoreArray(A,&da);
847: #if defined(PETSC_USE_DEBUG)
848: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
850: #endif
851: A->factortype = MAT_FACTOR_QR;
852: a->rank = min;
853: PetscLogGpuFlops(2.0*min*min*(max-min/3.0));
855: A->ops->solve = MatSolve_SeqDenseCUDA_QR;
856: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA_QR;
857: A->ops->matsolve = MatMatSolve_SeqDenseCUDA_QR;
858: A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_QR;
860: PetscFree(A->solvertype);
861: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
862: return 0;
863: }
865: /* GEMM kernel: C = op(A)*op(B), tA, tB flag transposition */
866: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A,Mat B,Mat C,PetscBool tA,PetscBool tB)
867: {
868: const PetscScalar *da,*db;
869: PetscScalar *dc;
870: PetscScalar one=1.0,zero=0.0;
871: PetscCuBLASInt m,n,k;
872: PetscInt alda,blda,clda;
873: cublasHandle_t cublasv2handle;
874: PetscBool Aiscuda,Biscuda;
875: cublasStatus_t berr;
877: /* we may end up with SEQDENSE as one of the arguments */
878: PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&Aiscuda);
879: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&Biscuda);
880: if (!Aiscuda) MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
881: if (!Biscuda) MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
882: PetscCuBLASIntCast(C->rmap->n,&m);
883: PetscCuBLASIntCast(C->cmap->n,&n);
884: if (tA) PetscCuBLASIntCast(A->rmap->n,&k);
885: else PetscCuBLASIntCast(A->cmap->n,&k);
886: if (!m || !n || !k) return 0;
887: PetscInfo(C,"Matrix-Matrix product %d x %d x %d on backend\n",m,k,n);
888: MatDenseCUDAGetArrayRead(A,&da);
889: MatDenseCUDAGetArrayRead(B,&db);
890: MatDenseCUDAGetArrayWrite(C,&dc);
891: MatDenseGetLDA(A,&alda);
892: MatDenseGetLDA(B,&blda);
893: MatDenseGetLDA(C,&clda);
894: PetscCUBLASGetHandle(&cublasv2handle);
895: PetscLogGpuTimeBegin();
896: berr = cublasXgemm(cublasv2handle,tA ? CUBLAS_OP_T : CUBLAS_OP_N,tB ? CUBLAS_OP_T : CUBLAS_OP_N,
897:                      m,n,k,&one,da,alda,db,blda,&zero,dc,clda);
898: PetscLogGpuTimeEnd();
899: PetscLogGpuFlops(1.0*m*n*k + 1.0*m*n*(k-1));
900: MatDenseCUDARestoreArrayRead(A,&da);
901: MatDenseCUDARestoreArrayRead(B,&db);
902: MatDenseCUDARestoreArrayWrite(C,&dc);
903: if (!Aiscuda) MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
904: if (!Biscuda) MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
905: return 0;
906: }
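/* Editor's note: cuBLAS expects column-major storage, which matches the PETSc dense layout, so
   the leading dimensions alda/blda/clda are passed straight through; m and n are the dimensions
   of C and k is the inner (contracted) dimension, i.e. the number of columns of op(A). The
   logged count m*n*k + m*n*(k-1) is the usual 2*m*n*k - m*n flop total for a GEMM with
   beta = 0. */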
908: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
909: {
910: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_TRUE,PETSC_FALSE);
911: return 0;
912: }
914: PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
915: {
916: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_FALSE);
917: return 0;
918: }
920: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
921: {
922: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_TRUE);
923: return 0;
924: }
926: PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
927: {
928: MatProductSetFromOptions_SeqDense(C);
929: return 0;
930: }
932: /* zz = op(A)*xx + yy
933: if yy == NULL, only MatMult */
934: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans)
935: {
936: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
937: const PetscScalar *xarray,*da;
938: PetscScalar *zarray;
939: PetscScalar one=1.0,zero=0.0;
940: PetscCuBLASInt m, n, lda;
941: cublasHandle_t cublasv2handle;
942: cublasStatus_t berr;
944: /* mult add */
945: if (yy && yy != zz) VecCopy_SeqCUDA(yy,zz);
946: if (!A->rmap->n || !A->cmap->n) {
947: /* mult only */
948: if (!yy) VecSet_SeqCUDA(zz,0.0);
949: return 0;
950: }
951: PetscInfo(A,"Matrix-vector product %d x %d on backend\n",A->rmap->n,A->cmap->n);
952: PetscCuBLASIntCast(A->rmap->n,&m);
953: PetscCuBLASIntCast(A->cmap->n,&n);
954: PetscCUBLASGetHandle(&cublasv2handle);
955: MatDenseCUDAGetArrayRead(A,&da);
956: PetscCuBLASIntCast(mat->lda,&lda);
957: VecCUDAGetArrayRead(xx,&xarray);
958: VecCUDAGetArray(zz,&zarray);
959: PetscLogGpuTimeBegin();
960: berr = cublasXgemv(cublasv2handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,
961:                      m,n,&one,da,lda,xarray,1,(yy ? &one : &zero),zarray,1);
962: PetscLogGpuTimeEnd();
963: PetscLogGpuFlops(2.0*A->rmap->n*A->cmap->n - (yy ? 0 : A->rmap->n));
964: VecCUDARestoreArrayRead(xx,&xarray);
965: VecCUDARestoreArray(zz,&zarray);
966: MatDenseCUDARestoreArrayRead(A,&da);
967: return 0;
968: }
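/* Editor's note: the gemv call above computes z = op(A)*x + beta*z, with beta = 1 when an
   additive vector yy was supplied (its contents were copied into zz first) and beta = 0
   otherwise, so the same routine serves MatMult, MatMultTranspose, MatMultAdd and
   MatMultTransposeAdd below. */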
970: PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
971: {
972: MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_FALSE);
973: return 0;
974: }
976: PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
977: {
978: MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_TRUE);
979: return 0;
980: }
982: PetscErrorCode MatMult_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
983: {
984: MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_FALSE);
985: return 0;
986: }
988: PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
989: {
990: MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_TRUE);
991: return 0;
992: }
994: static PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A,const PetscScalar **array)
995: {
996: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
998: MatSeqDenseCUDACopyFromGPU(A);
999: *array = mat->v;
1000: return 0;
1001: }
1003: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseCUDA(Mat A,PetscScalar **array)
1004: {
1005: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
1007: /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
1008: if (!mat->v) MatSeqDenseSetPreallocation(A,NULL);
1009: *array = mat->v;
1010: A->offloadmask = PETSC_OFFLOAD_CPU;
1011: return 0;
1012: }
1014: static PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A,PetscScalar **array)
1015: {
1016: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
1018: MatSeqDenseCUDACopyFromGPU(A);
1019: *array = mat->v;
1020: A->offloadmask = PETSC_OFFLOAD_CPU;
1021: return 0;
1022: }
1024: PetscErrorCode MatScale_SeqDenseCUDA(Mat Y,PetscScalar alpha)
1025: {
1026: Mat_SeqDense *y = (Mat_SeqDense*)Y->data;
1027: PetscScalar *dy;
1028: PetscCuBLASInt j,N,m,lday,one = 1;
1029: cublasHandle_t cublasv2handle;
1031: PetscCUBLASGetHandle(&cublasv2handle);
1032: MatDenseCUDAGetArray(Y,&dy);
1033: PetscCuBLASIntCast(Y->rmap->n*Y->cmap->n,&N);
1034: PetscCuBLASIntCast(Y->rmap->n,&m);
1035: PetscCuBLASIntCast(y->lda,&lday);
1036: PetscInfo(Y,"Performing Scale %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
1037: PetscLogGpuTimeBegin();
1038: if (lday>m) {
1039: for (j=0; j<Y->cmap->n; j++) cublasXscal(cublasv2handle,m,&alpha,dy+lday*j,one);
1040: } else cublasXscal(cublasv2handle,N,&alpha,dy,one);
1041: PetscLogGpuTimeEnd();
1042: PetscLogGpuFlops(N);
1043: MatDenseCUDARestoreArray(Y,&dy);
1044: return 0;
1045: }
1047: PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y,PetscScalar alpha,Mat X,MatStructure str)
1048: {
1049: Mat_SeqDense *x = (Mat_SeqDense*)X->data;
1050: Mat_SeqDense *y = (Mat_SeqDense*)Y->data;
1051: const PetscScalar *dx;
1052: PetscScalar *dy;
1053: PetscCuBLASInt j,N,m,ldax,lday,one = 1;
1054: cublasHandle_t cublasv2handle;
1056: if (!X->rmap->n || !X->cmap->n) return 0;
1057: PetscCUBLASGetHandle(&cublasv2handle);
1058: MatDenseCUDAGetArrayRead(X,&dx);
1059: if (alpha == 0.0) MatDenseCUDAGetArrayWrite(Y,&dy);
1060: else MatDenseCUDAGetArray(Y,&dy);
1061: PetscCuBLASIntCast(X->rmap->n*X->cmap->n,&N);
1062: PetscCuBLASIntCast(X->rmap->n,&m);
1063: PetscCuBLASIntCast(x->lda,&ldax);
1064: PetscCuBLASIntCast(y->lda,&lday);
1065: PetscInfo(Y,"Performing AXPY %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
1066: PetscLogGpuTimeBegin();
1067: if (ldax>m || lday>m) {
1068: for (j=0; j<X->cmap->n; j++) {
1069: cublasXaxpy(cublasv2handle,m,&alpha,dx+j*ldax,one,dy+j*lday,one);
1070: }
1071: } else cublasXaxpy(cublasv2handle,N,&alpha,dx,one,dy,one);
1072: PetscLogGpuTimeEnd();
1073: PetscLogGpuFlops(PetscMax(2.*N-1,0));
1074: MatDenseCUDARestoreArrayRead(X,&dx);
1075: if (alpha == 0.0) MatDenseCUDARestoreArrayWrite(Y,&dy);
1076: else MatDenseCUDARestoreArray(Y,&dy);
1077: return 0;
1078: }
1080: static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
1081: {
1082: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1084: if (dA) {
1086: if (!dA->user_alloc) cudaFree(dA->d_v);
1087: cudaFree(dA->d_fact_tau);
1088: cudaFree(dA->d_fact_ipiv);
1089: cudaFree(dA->d_fact_info);
1090: cudaFree(dA->d_fact_work);
1091: VecDestroy(&dA->workvec);
1092: }
1093: PetscFree(A->spptr);
1094: return 0;
1095: }
1097: PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
1098: {
1099: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1101:   /* prevent copying the data back if we own the data pointer */
1102: if (!a->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
1103: MatConvert_SeqDenseCUDA_SeqDense(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
1104: MatDestroy_SeqDense(A);
1105: return 0;
1106: }
1108: PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A,MatDuplicateOption cpvalues,Mat *B)
1109: {
1110: MatDuplicateOption hcpvalues = (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : cpvalues;
1112: MatCreate(PetscObjectComm((PetscObject)A),B);
1113: MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
1114: MatSetType(*B,((PetscObject)A)->type_name);
1115: MatDuplicateNoCreate_SeqDense(*B,A,hcpvalues);
1116: if (cpvalues == MAT_COPY_VALUES && hcpvalues != MAT_COPY_VALUES) {
1117: MatCopy_SeqDenseCUDA(A,*B,SAME_NONZERO_PATTERN);
1118: }
1119: if (cpvalues != MAT_COPY_VALUES) { /* allocate memory if needed */
1120: Mat_SeqDenseCUDA *dB = (Mat_SeqDenseCUDA*)(*B)->spptr;
1121: if (!dB->d_v) {
1122: MatSeqDenseCUDASetPreallocation(*B,NULL);
1123: }
1124: }
1125: return 0;
1126: }
1128: static PetscErrorCode MatGetColumnVector_SeqDenseCUDA(Mat A,Vec v,PetscInt col)
1129: {
1130: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1131: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1132: PetscScalar *x;
1133: PetscBool viscuda;
1135: PetscObjectTypeCompareAny((PetscObject)v,&viscuda,VECSEQCUDA,VECMPICUDA,VECCUDA,"");
1136: if (viscuda && !v->boundtocpu) { /* update device data */
1137: VecCUDAGetArrayWrite(v,&x);
1138: if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1139:       cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
1140: } else {
1141: cudaMemcpy(x,a->v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);
1142: }
1143: VecCUDARestoreArrayWrite(v,&x);
1144: } else { /* update host data */
1145: VecGetArrayWrite(v,&x);
1146: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask & PETSC_OFFLOAD_CPU) {
1147: PetscArraycpy(x,a->v+col*a->lda,A->rmap->n);
1148: } else if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1149: cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
1150: }
1151: VecRestoreArrayWrite(v,&x);
1152: }
1153: return 0;
1154: }
1156: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A,MatFactorType ftype,Mat *fact)
1157: {
1158: MatCreate(PetscObjectComm((PetscObject)A),fact);
1159: MatSetSizes(*fact,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
1160: MatSetType(*fact,MATSEQDENSECUDA);
1161: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
1162: (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1163: (*fact)->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1164: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1165: (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1166: } else if (ftype == MAT_FACTOR_QR) {
1167: PetscObjectComposeFunction((PetscObject)(*fact),"MatQRFactor_C",MatQRFactor_SeqDense);
1168: PetscObjectComposeFunction((PetscObject)(*fact),"MatQRFactorSymbolic_C",MatQRFactorSymbolic_SeqDense);
1169: }
1170: (*fact)->factortype = ftype;
1171: PetscFree((*fact)->solvertype);
1172: PetscStrallocpy(MATSOLVERCUDA,&(*fact)->solvertype);
1173: PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_LU]);
1174: PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_ILU]);
1175: PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_CHOLESKY]);
1176: PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_ICC]);
1177: return 0;
1178: }
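/* Editor's note: a sketch of the user-level path that reaches the CUDA factorizations above,
   using the generic PETSc KSP/PC interface (A is a MATSEQDENSECUDA matrix and b, x are existing
   Vec objects; these names are assumptions for illustration):

     KSP ksp;
     PC  pc;
     KSPCreate(PETSC_COMM_SELF,&ksp);
     KSPSetOperators(ksp,A,A);
     KSPSetType(ksp,KSPPREONLY);
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCLU);                        // or PCCHOLESKY
     PCFactorSetMatSolverType(pc,MATSOLVERCUDA);
     KSPSolve(ksp,b,x);
     KSPDestroy(&ksp);

   or, equivalently, the standard options -ksp_type preonly -pc_type lu
   -pc_factor_mat_solver_type cuda. */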
1180: static PetscErrorCode MatDenseGetColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1181: {
1182: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1186: MatDenseCUDAGetArray(A,(PetscScalar**)&a->ptrinuse);
1187: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1188: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1189: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1190: }
1191: a->vecinuse = col + 1;
1192: VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1193: *v = a->cvec;
1194: return 0;
1195: }
1197: static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1198: {
1199: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1203: a->vecinuse = 0;
1204: VecCUDAResetArray(a->cvec);
1205: MatDenseCUDARestoreArray(A,(PetscScalar**)&a->ptrinuse);
1206: if (v) *v = NULL;
1207: return 0;
1208: }
1210: static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1211: {
1212: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1216: MatDenseCUDAGetArrayRead(A,&a->ptrinuse);
1217: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1218: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1219: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1220: }
1221: a->vecinuse = col + 1;
1222: VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1223: VecLockReadPush(a->cvec);
1224: *v = a->cvec;
1225: return 0;
1226: }
1228: static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1229: {
1230: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1234: a->vecinuse = 0;
1235: VecLockReadPop(a->cvec);
1236: VecCUDAResetArray(a->cvec);
1237: MatDenseCUDARestoreArrayRead(A,&a->ptrinuse);
1238: if (v) *v = NULL;
1239: return 0;
1240: }
1242: static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1243: {
1244: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1248: MatDenseCUDAGetArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1249: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1250: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1251: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1252: }
1253: a->vecinuse = col + 1;
1254: VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1255: *v = a->cvec;
1256: return 0;
1257: }
1259: static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1260: {
1261: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1265: a->vecinuse = 0;
1266: VecCUDAResetArray(a->cvec);
1267: MatDenseCUDARestoreArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1268: if (v) *v = NULL;
1269: return 0;
1270: }
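/* Editor's note: the GetColumnVec/RestoreColumnVec triplets above (plain, Read, Write) all reuse
   a single cached vector a->cvec: VecCUDAPlaceArray points it at column col of the device array
   (offset col*lda), and the matching Restore call resets it. The a->vecinuse flag guards against
   more than one column vector being outstanding at a time. */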
1272: static PetscErrorCode MatDenseGetSubMatrix_SeqDenseCUDA(Mat A,PetscInt cbegin,PetscInt cend,Mat *v)
1273: {
1274: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1275: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1279: if (a->cmat && cend-cbegin != a->cmat->cmap->N) {
1280: MatDestroy(&a->cmat);
1281: }
1282: MatSeqDenseCUDACopyToGPU(A);
1283: if (!a->cmat) {
1284: MatCreateDenseCUDA(PetscObjectComm((PetscObject)A),A->rmap->n,PETSC_DECIDE,A->rmap->N,cend-cbegin,dA->d_v+(size_t)cbegin*a->lda,&a->cmat);
1285: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cmat);
1286: } else {
1287: MatDenseCUDAPlaceArray(a->cmat,dA->d_v+(size_t)cbegin*a->lda);
1288: }
1289: MatDenseSetLDA(a->cmat,a->lda);
1290: if (a->v) MatDensePlaceArray(a->cmat,a->v+(size_t)cbegin*a->lda);
1291: a->cmat->offloadmask = A->offloadmask;
1292: a->matinuse = cbegin + 1;
1293: *v = a->cmat;
1294: return 0;
1295: }
1297: static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseCUDA(Mat A,Mat *v)
1298: {
1299: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1304: a->matinuse = 0;
1305: A->offloadmask = (a->cmat->offloadmask == PETSC_OFFLOAD_CPU) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1306: MatDenseCUDAResetArray(a->cmat);
1307: if (a->unplacedarray) MatDenseResetArray(a->cmat);
1308: a->cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1309: if (v) *v = NULL;
1310: return 0;
1311: }
1313: static PetscErrorCode MatDenseSetLDA_SeqDenseCUDA(Mat A,PetscInt lda)
1314: {
1315: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
1316: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1317: PetscBool data;
1319: data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
1322: cA->lda = lda;
1323: return 0;
1324: }
1326: static PetscErrorCode MatSetUp_SeqDenseCUDA(Mat A)
1327: {
1328: PetscLayoutSetUp(A->rmap);
1329: PetscLayoutSetUp(A->cmap);
1330: if (!A->preallocated) MatSeqDenseCUDASetPreallocation(A,NULL);
1331: return 0;
1332: }
1334: static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A,PetscBool flg)
1335: {
1336: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1340: A->boundtocpu = flg;
1341: if (!flg) {
1342: PetscBool iscuda;
1344: PetscObjectTypeCompare((PetscObject)a->cvec,VECSEQCUDA,&iscuda);
1345: if (!iscuda) {
1346: VecDestroy(&a->cvec);
1347: }
1348: PetscObjectTypeCompare((PetscObject)a->cmat,MATSEQDENSECUDA,&iscuda);
1349: if (!iscuda) {
1350: MatDestroy(&a->cmat);
1351: }
1352: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDenseCUDA);
1353: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArrayRead_SeqDenseCUDA);
1354: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArrayWrite_SeqDenseCUDA);
1355: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDenseCUDA);
1356: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDenseCUDA);
1357: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDenseCUDA);
1358: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDenseCUDA);
1359: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDenseCUDA);
1360: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDenseCUDA);
1361: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDenseCUDA);
1362: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDenseCUDA);
1363: PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDenseCUDA);
1364: PetscObjectComposeFunction((PetscObject)A,"MatQRFactor_C",MatQRFactor_SeqDenseCUDA);
1366: A->ops->duplicate = MatDuplicate_SeqDenseCUDA;
1367: A->ops->mult = MatMult_SeqDenseCUDA;
1368: A->ops->multadd = MatMultAdd_SeqDenseCUDA;
1369: A->ops->multtranspose = MatMultTranspose_SeqDenseCUDA;
1370: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDenseCUDA;
1371: A->ops->matmultnumeric = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1372: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1373: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1374: A->ops->axpy = MatAXPY_SeqDenseCUDA;
1375: A->ops->choleskyfactor = MatCholeskyFactor_SeqDenseCUDA;
1376: A->ops->lufactor = MatLUFactor_SeqDenseCUDA;
1377: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDenseCUDA;
1378: A->ops->getcolumnvector = MatGetColumnVector_SeqDenseCUDA;
1379: A->ops->scale = MatScale_SeqDenseCUDA;
1380: A->ops->copy = MatCopy_SeqDenseCUDA;
1381: A->ops->zeroentries = MatZeroEntries_SeqDenseCUDA;
1382: A->ops->setup = MatSetUp_SeqDenseCUDA;
1383: } else {
1384: /* make sure we have an up-to-date copy on the CPU */
1385: MatSeqDenseCUDACopyFromGPU(A);
1386: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDense);
1387: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArray_SeqDense);
1388: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArray_SeqDense);
1389: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDense);
1390: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDense);
1391: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDense);
1392: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDense);
1393: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDense);
1394: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDense);
1395: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDense);
1396: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDense);
1397: PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDense);
1398: PetscObjectComposeFunction((PetscObject)A,"MatQRFactor_C",MatQRFactor_SeqDense);
1400: A->ops->duplicate = MatDuplicate_SeqDense;
1401: A->ops->mult = MatMult_SeqDense;
1402: A->ops->multadd = MatMultAdd_SeqDense;
1403: A->ops->multtranspose = MatMultTranspose_SeqDense;
1404: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDense;
1405: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1406: A->ops->matmultnumeric = MatMatMultNumeric_SeqDense_SeqDense;
1407: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1408: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1409: A->ops->axpy = MatAXPY_SeqDense;
1410: A->ops->choleskyfactor = MatCholeskyFactor_SeqDense;
1411: A->ops->lufactor = MatLUFactor_SeqDense;
1413: A->ops->getcolumnvector = MatGetColumnVector_SeqDense;
1414: A->ops->scale = MatScale_SeqDense;
1415: A->ops->copy = MatCopy_SeqDense;
1416: A->ops->zeroentries = MatZeroEntries_SeqDense;
1417: A->ops->setup = MatSetUp_SeqDense;
1418: }
1419: if (a->cmat) {
1420: MatBindToCPU(a->cmat,flg);
1421: }
1422: return 0;
1423: }
1425: PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1426: {
1427: Mat B;
1428: Mat_SeqDense *a;
1430: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1431: /* TODO these cases should be optimized */
1432: MatConvert_Basic(M,type,reuse,newmat);
1433: return 0;
1434: }
1436: B = *newmat;
1437: MatBindToCPU_SeqDenseCUDA(B,PETSC_TRUE);
1438: MatReset_SeqDenseCUDA(B);
1439: PetscFree(B->defaultvectype);
1440: PetscStrallocpy(VECSTANDARD,&B->defaultvectype);
1441: PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSE);
1442: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",NULL);
1443: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C",NULL);
1444: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C",NULL);
1445: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C",NULL);
1446: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C",NULL);
1447: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C",NULL);
1448: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C",NULL);
1449: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C",NULL);
1450: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C",NULL);
1451: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C",NULL);
1452: PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",NULL);
1453: a = (Mat_SeqDense*)B->data;
1454: VecDestroy(&a->cvec); /* cvec might be VECSEQCUDA. Destroy it and rebuild a VECSEQ when needed */
1455: B->ops->bindtocpu = NULL;
1456: B->ops->destroy = MatDestroy_SeqDense;
1457: B->offloadmask = PETSC_OFFLOAD_CPU;
1458: return 0;
1459: }
1461: PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1462: {
1463: Mat_SeqDenseCUDA *dB;
1464: Mat B;
1465: Mat_SeqDense *a;
1467: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
1468: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1469: /* TODO these cases should be optimized */
1470: MatConvert_Basic(M,type,reuse,newmat);
1471: return 0;
1472: }
1474: B = *newmat;
1475: PetscFree(B->defaultvectype);
1476: PetscStrallocpy(VECCUDA,&B->defaultvectype);
1477: PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSECUDA);
1478: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C", MatConvert_SeqDenseCUDA_SeqDense);
1479: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C", MatDenseCUDAGetArray_SeqDenseCUDA);
1480: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C", MatDenseCUDAGetArrayRead_SeqDenseCUDA);
1481: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C", MatDenseCUDAGetArrayWrite_SeqDenseCUDA);
1482: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C", MatDenseCUDARestoreArray_SeqDenseCUDA);
1483: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C", MatDenseCUDARestoreArrayRead_SeqDenseCUDA);
1484: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C", MatDenseCUDARestoreArrayWrite_SeqDenseCUDA);
1485: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C", MatDenseCUDAPlaceArray_SeqDenseCUDA);
1486: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C", MatDenseCUDAResetArray_SeqDenseCUDA);
1487: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C", MatDenseCUDAReplaceArray_SeqDenseCUDA);
1488: PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",MatProductSetFromOptions_SeqAIJ_SeqDense);
1489: a = (Mat_SeqDense*)B->data;
1490: VecDestroy(&a->cvec); /* cvec might be VECSEQ. Destroy it and rebuild a VECSEQCUDA when needed */
1491: PetscNewLog(B,&dB);
1493: B->spptr = dB;
1494: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1496: MatBindToCPU_SeqDenseCUDA(B,PETSC_FALSE);
1497: B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
1498: B->ops->destroy = MatDestroy_SeqDenseCUDA;
1499: return 0;
1500: }
1502: /*@C
1503: MatCreateSeqDenseCUDA - Creates a sequential matrix in dense format using CUDA.
1505: Collective
1507: Input Parameters:
1508: + comm - MPI communicator
1509: . m - number of rows
1510: . n - number of columns
1511: - data - optional location of GPU matrix data. Set data=NULL for PETSc
1512: to control matrix memory allocation.
1514: Output Parameter:
1515: . A - the matrix
1519: Level: intermediate
1521: .seealso: MatCreate(), MatCreateSeqDense()
1522: @*/
1523: PetscErrorCode MatCreateSeqDenseCUDA(MPI_Comm comm,PetscInt m,PetscInt n,PetscScalar *data,Mat *A)
1524: {
1525: PetscMPIInt size;
1527: MPI_Comm_size(comm,&size); /* a sequential matrix lives on a single-rank communicator */
1529: MatCreate(comm,A);
1530: MatSetSizes(*A,m,n,m,n);
1531: MatSetType(*A,MATSEQDENSECUDA);
1532: MatSeqDenseCUDASetPreallocation(*A,data);
1533: return 0;
1534: }
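/*
   A minimal usage sketch for MatCreateSeqDenseCUDA() (illustrative only; error checking is
   omitted to match the style of this listing). Passing data=NULL lets PETSc allocate the
   device storage; values inserted on the host side are mirrored to the GPU on demand.

     Mat         A;
     PetscInt    i = 0, j = 0;
     PetscScalar v = 1.0;

     MatCreateSeqDenseCUDA(PETSC_COMM_SELF,10,10,NULL,&A);
     MatSetValues(A,1,&i,1,&j,&v,INSERT_VALUES);
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
     MatDestroy(&A);
*/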
1536: /*MC
1537: MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.
1539: Options Database Keys:
1540: . -mat_type seqdensecuda - sets the matrix type to "seqdensecuda" during a call to MatSetFromOptions()
1542: Level: beginner
1543: M*/
1544: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
1545: {
1546: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
1547: MatCreate_SeqDense(B);
1548: MatConvert_SeqDense_SeqDenseCUDA(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
1549: return 0;
1550: }
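/*
   A minimal sketch of selecting this type through the options database rather than calling
   MatCreateSeqDenseCUDA() directly (illustrative only; assumes the program is run with
   -mat_type seqdensecuda so that MatSetFromOptions() picks up the type):

     Mat A;

     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,10,10,10,10);
     MatSetFromOptions(A);
     MatSetUp(A);
     MatDestroy(&A);
*/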