Actual source code: densecuda.cu
petsc-3.13.6 2020-09-29
1: /*
2: Defines the matrix operations for sequential dense matrices with CUDA
3: */
4: #include <petscpkg_version.h>
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6: #include <../src/mat/impls/dense/seq/dense.h>
7: #include <petsccublas.h>
9: /* cublas definitions are here */
10: #include <petsc/private/cudavecimpl.h>
12: #if defined(PETSC_USE_COMPLEX)
13: #if defined(PETSC_USE_REAL_SINGLE)
14: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnCpotrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
15: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnCpotrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
16: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnCpotrs((a),(b),(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g),(h),(i))
17: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnCpotri((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
18: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnCpotri_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
19: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnCsytrf((a),(b),(c),(cuComplex*)(d),(e),(f),(cuComplex*)(g),(h),(i))
20: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnCsytrf_bufferSize((a),(b),(cuComplex*)(c),(d),(e))
21: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnCgetrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
22: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnCgetrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
23: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnCgetrs((a),(b),(c),(d),(cuComplex*)(e),(f),(g),(cuComplex*)(h),(i),(j))
24: #else /* complex double */
25: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnZpotrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
26: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnZpotrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
27: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnZpotrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g),(h),(i))
28: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnZpotri((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
29: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnZpotri_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
30: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnZsytrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(f),(cuDoubleComplex*)(g),(h),(i))
31: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnZsytrf_bufferSize((a),(b),(cuDoubleComplex*)(c),(d),(e))
32: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnZgetrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
33: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnZgetrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
34: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnZgetrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(g),(cuDoubleComplex*)(h),(i),(j))
35: #endif
36: #else /* real single */
37: #if defined(PETSC_USE_REAL_SINGLE)
38: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnSpotrf((a),(b),(c),(d),(e),(f),(g),(h))
39: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnSpotrf_bufferSize((a),(b),(c),(d),(e),(f))
40: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnSpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
41: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnSpotri((a),(b),(c),(d),(e),(f),(g),(h))
42: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnSpotri_bufferSize((a),(b),(c),(d),(e),(f))
43: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnSsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
44: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnSsytrf_bufferSize((a),(b),(c),(d),(e))
45: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnSgetrf((a),(b),(c),(d),(e),(f),(g),(h))
46: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnSgetrf_bufferSize((a),(b),(c),(d),(e),(f))
47: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnSgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
48: #else /* real double */
49: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnDpotrf((a),(b),(c),(d),(e),(f),(g),(h))
50: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnDpotrf_bufferSize((a),(b),(c),(d),(e),(f))
51: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnDpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
52: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnDpotri((a),(b),(c),(d),(e),(f),(g),(h))
53: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnDpotri_bufferSize((a),(b),(c),(d),(e),(f))
54: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnDsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
55: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnDsytrf_bufferSize((a),(b),(c),(d),(e))
56: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnDgetrf((a),(b),(c),(d),(e),(f),(g),(h))
57: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnDgetrf_bufferSize((a),(b),(c),(d),(e),(f))
58: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnDgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
59: #endif
60: #endif
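/*
   The X-suffixed names defined above are thin dispatch macros: at compile time they
   select the cuSOLVER routine matching the PetscScalar precision chosen when PETSc
   was configured and insert the cuComplex/cuDoubleComplex casts where needed, so the
   rest of this file can be written precision-agnostically.  For example, with real
   double precision (the default configuration) a call written below as

     cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);

   expands to

     cusolverDnDpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);

   while a complex double build would call cusolverDnZpotrf with the pointer casts shown above.
*/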
62: typedef struct {
63: PetscScalar *d_v; /* pointer to the matrix on the GPU */
64: /* factorization support */
65: int *d_fact_ipiv; /* device pivots */
66: PetscScalar *d_fact_work; /* device workspace */
67: int fact_lwork;
68: int *d_fact_info; /* device info */
69: /* workspace */
70: Vec workvec;
71: } Mat_SeqDenseCUDA;
73: PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
74: {
75: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
76: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
77: PetscErrorCode ierr;
78: cudaError_t cerr;
82: PetscInfo3(A,"%s matrix %d x %d\n",A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
83: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
84: PetscLogEventBegin(MAT_DenseCopyFromGPU,A,0,0,0);
85: if (cA->lda > A->rmap->n) {
86: PetscInt j,m = A->rmap->n;
88: for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
89: cerr = cudaMemcpy(cA->v + j*cA->lda,dA->d_v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
90: }
91: } else {
92: cerr = cudaMemcpy(cA->v,dA->d_v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
93: }
94: PetscLogGpuToCpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
95: PetscLogEventEnd(MAT_DenseCopyFromGPU,A,0,0,0);
97: A->offloadmask = PETSC_OFFLOAD_BOTH;
98: }
99: return(0);
100: }
102: PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
103: {
104: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
105: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
106: PetscBool copy;
107: PetscErrorCode ierr;
108: cudaError_t cerr;
112: if (A->boundtocpu) return(0);
113: if (!dA->d_v) {
114: cerr = cudaMalloc((void**)&dA->d_v,cA->lda*cA->Nmax*sizeof(PetscScalar));CHKERRCUDA(cerr);
115: }
116: copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
117: PetscInfo3(A,"%s matrix %d x %d\n",copy ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
118: if (copy) {
119: PetscLogEventBegin(MAT_DenseCopyToGPU,A,0,0,0);
120: if (cA->lda > A->rmap->n) {
121: PetscInt j,m = A->rmap->n;
123: for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
124: cerr = cudaMemcpy(dA->d_v + j*cA->lda,cA->v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
125: }
126: } else {
127: cerr = cudaMemcpy(dA->d_v,cA->v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
128: }
129: PetscLogCpuToGpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
130: PetscLogEventEnd(MAT_DenseCopyToGPU,A,0,0,0);
132: A->offloadmask = PETSC_OFFLOAD_BOTH;
133: }
134: return(0);
135: }
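/*
   Editorial sketch, not part of the original code: one way the TODO in the two copy
   routines above could be addressed is a single strided transfer with cudaMemcpy2D(),
   treating each column as one "row" of m scalars.  For the device-to-host direction,
   with the same variable names as in MatSeqDenseCUDACopyFromGPU():

     cerr = cudaMemcpy2D(cA->v,cA->lda*sizeof(PetscScalar),      // dst and dst pitch (bytes)
                         dA->d_v,cA->lda*sizeof(PetscScalar),    // src and src pitch (bytes)
                         m*sizeof(PetscScalar),A->cmap->n,       // width (bytes) and height (columns)
                         cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
*/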
137: PetscErrorCode MatDenseCUDAGetArrayWrite(Mat A, PetscScalar **a)
138: {
139: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
143: if (!dA->d_v) {
144: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
145: cudaError_t cerr;
147: cerr = cudaMalloc((void**)&dA->d_v,cA->lda*cA->Nmax*sizeof(PetscScalar));CHKERRCUDA(cerr);
148: }
149: *a = dA->d_v;
150: return(0);
151: }
153: PetscErrorCode MatDenseCUDARestoreArrayWrite(Mat A, PetscScalar **a)
154: {
156: *a = NULL;
157: A->offloadmask = PETSC_OFFLOAD_GPU;
158: return(0);
159: }
161: PetscErrorCode MatDenseCUDAGetArrayRead(Mat A, const PetscScalar **a)
162: {
163: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
164: PetscErrorCode ierr;
168: MatSeqDenseCUDACopyToGPU(A);
169: *a = dA->d_v;
170: return(0);
171: }
173: PetscErrorCode MatDenseCUDARestoreArrayRead(Mat A, const PetscScalar **a)
174: {
176: *a = NULL;
177: return(0);
178: }
180: PetscErrorCode MatDenseCUDAGetArray(Mat A, PetscScalar **a)
181: {
182: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
183: PetscErrorCode ierr;
187: MatSeqDenseCUDACopyToGPU(A);
188: *a = dA->d_v;
189: return(0);
190: }
192: PetscErrorCode MatDenseCUDARestoreArray(Mat A, PetscScalar **a)
193: {
195: *a = NULL;
196: A->offloadmask = PETSC_OFFLOAD_GPU;
197: return(0);
198: }
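/*
   A minimal usage sketch for the accessor pairs above (error checking omitted, as in
   the rest of this listing; A is assumed to be a MATSEQDENSECUDA matrix):

     const PetscScalar *d_ro;
     PetscScalar       *d_wo;

     MatDenseCUDAGetArrayRead(A,&d_ro);      // copies host->device only if the device copy is stale
     // ... read-only use of d_ro in cuBLAS/cuSOLVER or a custom kernel ...
     MatDenseCUDARestoreArrayRead(A,&d_ro);  // offload mask left unchanged

     MatDenseCUDAGetArrayWrite(A,&d_wo);     // no copy; allocates the device buffer if needed
     // ... overwrite d_wo on the device ...
     MatDenseCUDARestoreArrayWrite(A,&d_wo); // offload mask becomes PETSC_OFFLOAD_GPU
*/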
200: PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
201: {
202: #if PETSC_PKG_CUDA_VERSION_GE(10,1,0)
203: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
204: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
205: PetscScalar *da;
206: PetscErrorCode ierr;
207: cudaError_t ccer;
208: cusolverStatus_t cerr;
209: cusolverDnHandle_t handle;
210: int n,lda;
211: #if defined(PETSC_USE_DEBUG)
212: int info;
213: #endif
216: if (!A->rmap->n || !A->cmap->n) return(0);
217: PetscCUSOLVERDnGetHandle(&handle);
218: PetscMPIIntCast(A->cmap->n,&n);
219: PetscMPIIntCast(a->lda,&lda);
220: if (A->factortype == MAT_FACTOR_LU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDngetri not implemented");
221: else if (A->factortype == MAT_FACTOR_CHOLESKY) {
222: if (!dA->d_fact_ipiv) { /* spd */
223: int il;
225: MatDenseCUDAGetArray(A,&da);
226: cerr = cusolverDnXpotri_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&il);CHKERRCUSOLVER(cerr);
227: if (il > dA->fact_lwork) {
228: dA->fact_lwork = il;
230: ccer = cudaFree(dA->d_fact_work);CHKERRCUDA(ccer);
231: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
232: }
233: PetscLogGpuTimeBegin();
234: cerr = cusolverDnXpotri(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
235: ccer = WaitForGPU();CHKERRCUDA(ccer);
236: PetscLogGpuTimeEnd();
237: MatDenseCUDARestoreArray(A,&da);
238: /* TODO (write cuda kernel) */
239: MatSeqDenseSymmetrize_Private(A,PETSC_TRUE);
240: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytri not implemented");
241: }
242: #if defined(PETSC_USE_DEBUG)
243: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
244: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: leading minor of order %d is zero",info);
245: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
246: #endif
247: PetscLogGpuFlops(1.0*n*n*n/3.0);
248: A->ops->solve = NULL;
249: A->ops->solvetranspose = NULL;
250: A->ops->matsolve = NULL;
251: A->factortype = MAT_FACTOR_NONE;
253: PetscFree(A->solvertype);
254: return(0);
255: #else
256: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Upgrade to CUDA version 10.1.0 or higher");
257: #endif
258: }
260: static PetscErrorCode MatMatSolve_SeqDenseCUDA(Mat A,Mat B,Mat X)
261: {
262: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
263: Mat_SeqDense *x = (Mat_SeqDense*)X->data;
264: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
265: const PetscScalar *da;
266: PetscScalar *dx;
267: cusolverDnHandle_t handle;
268: PetscBool iscuda;
269: int nrhs,n,lda,ldx;
270: #if defined(PETSC_USE_DEBUG)
271: int info;
272: #endif
273: cudaError_t ccer;
274: cusolverStatus_t cerr;
275: PetscErrorCode ierr;
278: if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
279: if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
281: if (X != B) {
282: MatCopy(B,X,SAME_NONZERO_PATTERN);
283: }
284: MatDenseCUDAGetArrayRead(A,&da);
285: /* MatMatSolve does not have a dispatching mechanism, so we may end up with a MATSEQDENSE here */
286: PetscObjectTypeCompare((PetscObject)X,MATSEQDENSECUDA,&iscuda);
287: if (!iscuda) {
288: MatConvert(X,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&X);
289: }
290: MatDenseCUDAGetArray(X,&dx);
291: PetscMPIIntCast(A->rmap->n,&n);
292: PetscMPIIntCast(X->cmap->n,&nrhs);
293: PetscMPIIntCast(a->lda,&lda);
294: PetscMPIIntCast(x->lda,&ldx);
295: PetscCUSOLVERDnGetHandle(&handle);
296: PetscLogGpuTimeBegin();
297: if (A->factortype == MAT_FACTOR_LU) {
298: PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
299: cerr = cusolverDnXgetrs(handle,CUBLAS_OP_N,n,nrhs,da,lda,dA->d_fact_ipiv,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
300: } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
301: PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
302: if (!dA->d_fact_ipiv) { /* spd */
303: /* note: cuda-memcheck has been seen to report "Program hit cudaErrorNotReady (error 34) due to 'device not ready' on CUDA API call to cudaEventQuery" for the call below */
304: cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,nrhs,da,lda,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
305: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
306: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
307: ccer = WaitForGPU();CHKERRCUDA(ccer);
308: PetscLogGpuTimeEnd();
309: MatDenseCUDARestoreArrayRead(A,&da);
310: MatDenseCUDARestoreArray(X,&dx);
311: if (!iscuda) {
312: MatConvert(X,MATSEQDENSE,MAT_INPLACE_MATRIX,&X);
313: }
314: #if defined(PETSC_USE_DEBUG)
315: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
316: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
317: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
318: #endif
319: PetscLogGpuFlops(nrhs*(2.0*n*n - n));
320: return(0);
321: }
323: static PetscErrorCode MatSolve_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,PetscBool trans)
324: {
325: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
326: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
327: const PetscScalar *da;
328: PetscScalar *y;
329: cusolverDnHandle_t handle;
330: int one = 1,n,lda;
331: #if defined(PETSC_USE_DEBUG)
332: int info;
333: #endif
334: cudaError_t ccer;
335: cusolverStatus_t cerr;
336: PetscBool iscuda;
337: PetscErrorCode ierr;
340: if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
341: if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
342: PetscMPIIntCast(A->rmap->n,&n);
343: /* MatSolve does not have a dispatching mechanism, so we may end up with a VECSTANDARD here */
344: PetscObjectTypeCompareAny((PetscObject)yy,&iscuda,VECSEQCUDA,VECMPICUDA,"");
345: if (iscuda) {
346: VecCopy(xx,yy);
347: VecCUDAGetArray(yy,&y);
348: } else {
349: if (!dA->workvec) {
350: MatCreateVecs(A,&dA->workvec,NULL);
351: }
352: VecCopy(xx,dA->workvec);
353: VecCUDAGetArray(dA->workvec,&y);
354: }
355: MatDenseCUDAGetArrayRead(A,&da);
356: PetscMPIIntCast(a->lda,&lda);
357: PetscCUSOLVERDnGetHandle(&handle);
358: PetscLogGpuTimeBegin();
359: if (A->factortype == MAT_FACTOR_LU) {
360: PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
361: cerr = cusolverDnXgetrs(handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,n,one,da,lda,dA->d_fact_ipiv,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
362: } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
363: PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
364: if (!dA->d_fact_ipiv) { /* spd */
365: /* note: cuda-memcheck has been seen to report "Program hit cudaErrorNotReady (error 34) due to 'device not ready' on CUDA API call to cudaEventQuery" for the call below */
366: cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,one,da,lda,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
367: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
368: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
369: ccer = WaitForGPU();CHKERRCUDA(ccer);
370: PetscLogGpuTimeEnd();
371: if (iscuda) {
372: VecCUDARestoreArray(yy,&y);
373: } else {
374: VecCUDARestoreArray(dA->workvec,&y);
375: VecCopy(dA->workvec,yy);
376: }
377: MatDenseCUDARestoreArrayRead(A,&da);
378: #if defined(PETSC_USE_DEBUG)
379: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
380: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
381: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
382: #endif
383: PetscLogGpuFlops(2.0*n*n - n);
384: return(0);
385: }
387: static PetscErrorCode MatSolve_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
388: {
389: PetscErrorCode ierr;
392: MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_FALSE);
393: return(0);
394: }
396: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
397: {
398: PetscErrorCode ierr;
401: MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_TRUE);
402: return(0);
403: }
405: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A,IS rperm,IS cperm,const MatFactorInfo *factinfo)
406: {
407: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
408: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
409: PetscScalar *da;
410: int m,n,lda;
411: #if defined(PETSC_USE_DEBUG)
412: int info;
413: #endif
414: cusolverStatus_t cerr;
415: cusolverDnHandle_t handle;
416: cudaError_t ccer;
417: PetscErrorCode ierr;
420: if (!A->rmap->n || !A->cmap->n) return(0);
421: PetscCUSOLVERDnGetHandle(&handle);
422: MatDenseCUDAGetArray(A,&da);
423: PetscMPIIntCast(A->cmap->n,&n);
424: PetscMPIIntCast(A->rmap->n,&m);
425: PetscMPIIntCast(a->lda,&lda);
426: PetscInfo2(A,"LU factor %d x %d on backend\n",m,n);
427: if (!dA->d_fact_ipiv) {
428: ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
429: }
430: if (!dA->fact_lwork) {
431: cerr = cusolverDnXgetrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
432: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
433: }
434: if (!dA->d_fact_info) {
435: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
436: }
437: PetscLogGpuTimeBegin();
438: cerr = cusolverDnXgetrf(handle,m,n,da,lda,dA->d_fact_work,dA->d_fact_ipiv,dA->d_fact_info);CHKERRCUSOLVER(cerr);
439: ccer = WaitForGPU();CHKERRCUDA(ccer);
440: PetscLogGpuTimeEnd();
441: MatDenseCUDARestoreArray(A,&da);
442: #if defined(PETSC_USE_DEBUG)
443: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
444: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
445: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
446: #endif
447: A->factortype = MAT_FACTOR_LU;
448: PetscLogGpuFlops(2.0*n*n*m/3.0);
450: A->ops->solve = MatSolve_SeqDenseCUDA;
451: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
452: A->ops->matsolve = MatMatSolve_SeqDenseCUDA;
454: PetscFree(A->solvertype);
455: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
456: return(0);
457: }
459: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A,IS perm,const MatFactorInfo *factinfo)
460: {
461: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
462: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
463: PetscScalar *da;
464: int n,lda;
465: #if defined(PETSC_USE_DEBUG)
466: int info;
467: #endif
468: cusolverStatus_t cerr;
469: cusolverDnHandle_t handle;
470: cudaError_t ccer;
471: PetscErrorCode ierr;
474: if (!A->rmap->n || !A->cmap->n) return(0);
475: PetscCUSOLVERDnGetHandle(&handle);
476: PetscMPIIntCast(A->rmap->n,&n);
477: PetscInfo2(A,"Cholesky factor %d x %d on backend\n",n,n);
478: if (A->spd) {
479: MatDenseCUDAGetArray(A,&da);
480: PetscMPIIntCast(a->lda,&lda);
481: if (!dA->fact_lwork) {
482: cerr = cusolverDnXpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
483: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
484: }
485: if (!dA->d_fact_info) {
486: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
487: }
488: PetscLogGpuTimeBegin();
489: cerr = cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
490: ccer = WaitForGPU();CHKERRCUDA(ccer);
491: PetscLogGpuTimeEnd();
493: MatDenseCUDARestoreArray(A,&da);
494: #if defined(PETSC_USE_DEBUG)
495: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
496: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
497: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
498: #endif
499: A->factortype = MAT_FACTOR_CHOLESKY;
500: PetscLogGpuFlops(1.0*n*n*n/3.0);
501: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
502: #if 0
503: /* at the time of writing this interface (CUDA 10.0), cusolverDn does not implement the *sytrs and *hetr* routines.
504: The code below should work and can be activated once the *sytrs routines become available */
505: if (!dA->d_fact_ipiv) {
506: ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
507: }
508: if (!dA->fact_lwork) {
509: cerr = cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
510: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
511: }
512: if (!dA->d_fact_info) {
513: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
514: }
515: PetscLogGpuTimeBegin();
516: cerr = cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
517: PetscLogGpuTimeEnd();
518: #endif
520: A->ops->solve = MatSolve_SeqDenseCUDA;
521: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
522: A->ops->matsolve = MatMatSolve_SeqDenseCUDA;
524: PetscFree(A->solvertype);
525: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
526: return(0);
527: }
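/*
   Usage note (a sketch; the call sequence is an assumption, error checking omitted):
   the device Cholesky above is only taken when the matrix has been marked symmetric
   positive definite; the indefinite (Bunch-Kaufman) path is stubbed out in the #if 0
   block above because cuSOLVER provides no matching solve routine.

     MatSetOption(A,MAT_SPD,PETSC_TRUE);   // enables the cusolverDnXpotrf branch
     MatCholeskyFactor(A,NULL,NULL);       // in-place Cholesky on the device
*/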
529: /* GEMM: C = op(A)*op(B) via cublasXgemm; tA and tB flag transposition of A and B */
530: static PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A,Mat B,Mat C,PetscBool tA, PetscBool tB)
531: {
532: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
533: Mat_SeqDense *b = (Mat_SeqDense*)B->data;
534: Mat_SeqDense *c = (Mat_SeqDense*)C->data;
535: const PetscScalar *da,*db;
536: PetscScalar *dc;
537: PetscScalar one=1.0,zero=0.0;
538: int m,n,k,alda,blda,clda;
539: PetscErrorCode ierr;
540: cublasHandle_t cublasv2handle;
541: cublasStatus_t berr;
542: cudaError_t cerr;
545: PetscMPIIntCast(C->rmap->n,&m);
546: PetscMPIIntCast(C->cmap->n,&n);
547: if (tA) {
548: PetscMPIIntCast(A->rmap->n,&k);
549: } else {
550: PetscMPIIntCast(A->cmap->n,&k);
551: }
552: if (!m || !n || !k) return(0);
553: PetscInfo3(C,"Matrix-Matrix product %d x %d x %d on backend\n",m,k,n);
554: MatDenseCUDAGetArrayRead(A,&da);
555: MatDenseCUDAGetArrayRead(B,&db);
556: MatDenseCUDAGetArrayWrite(C,&dc);
557: PetscMPIIntCast(a->lda,&alda);
558: PetscMPIIntCast(b->lda,&blda);
559: PetscMPIIntCast(c->lda,&clda);
560: PetscCUBLASGetHandle(&cublasv2handle);
561: PetscLogGpuTimeBegin();
562: berr = cublasXgemm(cublasv2handle,tA ? CUBLAS_OP_T : CUBLAS_OP_N,tB ? CUBLAS_OP_T : CUBLAS_OP_N,
563: m,n,k,&one,da,alda,db,blda,&zero,dc,clda);CHKERRCUBLAS(berr);
564: cerr = WaitForGPU();CHKERRCUDA(cerr);
565: PetscLogGpuTimeEnd();
566: PetscLogGpuFlops(1.0*m*n*k + 1.0*m*n*(k-1));
567: MatDenseCUDARestoreArrayRead(A,&da);
568: MatDenseCUDARestoreArrayRead(B,&db);
569: MatDenseCUDARestoreArrayWrite(C,&dc);
570: return(0);
571: }
573: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
574: {
578: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_TRUE,PETSC_FALSE);
579: return(0);
580: }
582: PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
583: {
587: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_FALSE);
588: return(0);
589: }
591: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
592: {
596: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_TRUE);
597: return(0);
598: }
600: PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
601: {
605: MatProductSetFromOptions_SeqDense(C);
606: return(0);
607: }
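/*
   With both operands of type MATSEQDENSECUDA, the product support above means the
   numeric phase of a dense matrix-matrix product should run as a single cublasXgemm
   on the device (see MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private).  A sketch,
   with A and B assumed to be existing MATSEQDENSECUDA matrices of compatible sizes
   and error checking omitted:

     Mat C,D;
     MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);           // C = A*B
     MatTransposeMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&D);  // D = A^T*B
*/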
609: /* zz = op(A)*xx + yy
610: if yy == NULL, compute only zz = op(A)*xx (plain MatMult) */
611: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans)
612: {
613: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
614: const PetscScalar *xarray,*da;
615: PetscScalar *zarray;
616: PetscScalar one=1.0,zero=0.0;
617: int m, n, lda; /* plain int: PetscMPIInt is typedef'ed to int and PetscMPIIntCast() is used below */
618: cublasHandle_t cublasv2handle;
619: cublasStatus_t berr;
620: PetscErrorCode ierr;
623: if (yy && yy != zz) { /* mult add */
624: VecCopy_SeqCUDA(yy,zz);
625: }
626: if (!A->rmap->n || !A->cmap->n) {
627: if (!yy) { /* mult only */
628: VecSet_SeqCUDA(zz,0.0);
629: }
630: return(0);
631: }
632: PetscInfo2(A,"Matrix-vector product %d x %d on backend\n",A->rmap->n,A->cmap->n);
633: PetscMPIIntCast(A->rmap->n,&m);
634: PetscMPIIntCast(A->cmap->n,&n);
635: PetscMPIIntCast(mat->lda,&lda);
636: PetscCUBLASGetHandle(&cublasv2handle);
637: MatDenseCUDAGetArrayRead(A,&da);
638: VecCUDAGetArrayRead(xx,&xarray);
639: VecCUDAGetArray(zz,&zarray);
640: PetscLogGpuTimeBegin();
641: berr = cublasXgemv(cublasv2handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,
642: m,n,&one,da,lda,xarray,1,(yy ? &one : &zero),zarray,1);CHKERRCUBLAS(berr);
643: PetscLogGpuTimeEnd();
644: PetscLogGpuFlops(2.0*A->rmap->n*A->cmap->n - (yy ? 0 : A->rmap->n));
645: VecCUDARestoreArrayRead(xx,&xarray);
646: VecCUDARestoreArray(zz,&zarray);
647: MatDenseCUDARestoreArrayRead(A,&da);
648: return(0);
649: }
651: PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
652: {
656: MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_FALSE);
657: return(0);
658: }
660: PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
661: {
665: MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_TRUE);
666: return(0);
667: }
669: PetscErrorCode MatMult_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
670: {
674: MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_FALSE);
675: return(0);
676: }
678: PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
679: {
683: MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_TRUE);
684: return(0);
685: }
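/*
   Since the CUDA dense type sets its default vector type to VECCUDA (see the
   conversion routine near the end of this file), vectors obtained from
   MatCreateVecs() live on the GPU and the wrappers above reduce to a single
   cublasXgemv call.  Sketch (error checking omitted; A is MATSEQDENSECUDA):

     Vec x,y;
     MatCreateVecs(A,&x,&y);        // x: domain vector, y: range vector, both CUDA vectors
     MatMult(A,x,y);                // y = A*x       -> CUBLAS_OP_N
     MatMultTransposeAdd(A,y,x,x);  // x = A^T*y + x -> CUBLAS_OP_T
*/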
687: PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A,const PetscScalar *array[])
688: {
689: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
693: MatSeqDenseCUDACopyFromGPU(A);
694: *array = mat->v;
695: return(0);
696: }
698: PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A,PetscScalar *array[])
699: {
700: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
704: MatSeqDenseCUDACopyFromGPU(A);
705: *array = mat->v;
706: A->offloadmask = PETSC_OFFLOAD_CPU;
707: return(0);
708: }
710: PetscErrorCode MatDenseRestoreArray_SeqDenseCUDA(Mat A,PetscScalar *array[])
711: {
713: return(0);
714: }
716: PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y,PetscScalar alpha,Mat X,MatStructure str)
717: {
718: Mat_SeqDense *x = (Mat_SeqDense*)X->data;
719: Mat_SeqDense *y = (Mat_SeqDense*)Y->data;
720: const PetscScalar *dx;
721: PetscScalar *dy;
722: int j,N,m,ldax,lday,one = 1;
723: cublasHandle_t cublasv2handle;
724: cublasStatus_t berr;
725: PetscErrorCode ierr;
726: cudaError_t cerr;
729: if (!X->rmap->n || !X->cmap->n) return(0);
730: PetscCUBLASGetHandle(&cublasv2handle);
731: MatDenseCUDAGetArrayRead(X,&dx);
732: if (alpha != 0.0) {
733: MatDenseCUDAGetArray(Y,&dy);
734: } else {
735: MatDenseCUDAGetArrayWrite(Y,&dy);
736: }
737: PetscMPIIntCast(X->rmap->n*X->cmap->n,&N);
738: PetscMPIIntCast(X->rmap->n,&m);
739: PetscMPIIntCast(x->lda,&ldax);
740: PetscMPIIntCast(y->lda,&lday);
741: PetscInfo2(Y,"Performing AXPY %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
742: PetscLogGpuTimeBegin();
743: if (ldax>m || lday>m) {
744: for (j=0; j<X->cmap->n; j++) {
745: berr = cublasXaxpy(cublasv2handle,m,&alpha,dx+j*ldax,one,dy+j*lday,one);CHKERRCUBLAS(berr);
746: }
747: } else {
748: berr = cublasXaxpy(cublasv2handle,N,&alpha,dx,one,dy,one);CHKERRCUBLAS(berr);
749: }
750: cerr = WaitForGPU();CHKERRCUDA(cerr);
751: PetscLogGpuTimeEnd();
752: PetscLogGpuFlops(PetscMax(2.*N-1,0));
753: MatDenseCUDARestoreArrayRead(X,&dx);
754: if (alpha != 0.0) {
755: MatDenseCUDARestoreArray(Y,&dy);
756: } else {
757: MatDenseCUDARestoreArrayWrite(Y,&dy);
758: }
759: return(0);
760: }
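/*
   Usage sketch for the device AXPY above (error checking omitted; X and Y are assumed
   to be MATSEQDENSECUDA matrices of the same size; the MatStructure argument is not
   used by this dense implementation):

     MatAXPY(Y,alpha,X,SAME_NONZERO_PATTERN);   // Y = Y + alpha*X via cublasXaxpy
*/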
762: static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
763: {
764: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
765: cudaError_t cerr;
766: PetscErrorCode ierr;
769: if (dA) {
770: cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr);
771: cerr = cudaFree(dA->d_fact_ipiv);CHKERRCUDA(cerr);
772: cerr = cudaFree(dA->d_fact_info);CHKERRCUDA(cerr);
773: cerr = cudaFree(dA->d_fact_work);CHKERRCUDA(cerr);
774: VecDestroy(&dA->workvec);
775: }
776: PetscFree(A->spptr);
777: return(0);
778: }
780: PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
781: {
782: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
786: /* avoid copying the data back from the GPU if we own the data pointer */
787: if (!a->user_alloc) { A->offloadmask = PETSC_OFFLOAD_CPU; }
788: MatConvert_SeqDenseCUDA_SeqDense(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
789: MatDestroy_SeqDense(A);
790: return(0);
791: }
793: PetscErrorCode MatSeqDenseSetPreallocation_SeqDenseCUDA(Mat B,PetscScalar *data)
794: {
795: Mat_SeqDense *b;
796: Mat_SeqDenseCUDA *dB;
797: cudaError_t cerr;
798: PetscErrorCode ierr;
801: PetscLayoutSetUp(B->rmap);
802: PetscLayoutSetUp(B->cmap);
803: b = (Mat_SeqDense*)B->data;
804: b->Mmax = B->rmap->n;
805: b->Nmax = B->cmap->n;
806: if (b->lda <= 0 || b->changelda) b->lda = B->rmap->n;
807: if (b->lda < B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Invalid lda %D < %D",b->lda,B->rmap->n);
809: PetscIntMultError(b->lda,b->Nmax,NULL);
811: MatReset_SeqDenseCUDA(B);
812: PetscNewLog(B,&dB);
813: B->spptr = dB;
814: cerr = cudaMalloc((void**)&dB->d_v,b->lda*b->Nmax*sizeof(PetscScalar));CHKERRCUDA(cerr);
816: if (!data) { /* petsc-allocated storage */
817: if (!b->user_alloc) { PetscFree(b->v); }
818: PetscCalloc1((size_t)b->lda*b->Nmax,&b->v);
819: PetscLogObjectMemory((PetscObject)B,b->lda*b->Nmax*sizeof(PetscScalar));
820: b->user_alloc = PETSC_FALSE;
821: } else { /* user-allocated storage */
822: if (!b->user_alloc) { PetscFree(b->v); }
823: b->v = data;
824: b->user_alloc = PETSC_TRUE;
825: }
826: B->offloadmask = PETSC_OFFLOAD_CPU;
827: B->preallocated = PETSC_TRUE;
828: B->assembled = PETSC_TRUE;
829: return(0);
830: }
832: PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A,MatDuplicateOption cpvalues,Mat *B)
833: {
837: MatCreate(PetscObjectComm((PetscObject)A),B);
838: MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
839: MatSetType(*B,((PetscObject)A)->type_name);
840: MatDuplicateNoCreate_SeqDense(*B,A,cpvalues);
841: if (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) {
842: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
843: const PetscScalar *da;
844: PetscScalar *db;
845: cudaError_t cerr;
847: MatDenseCUDAGetArrayRead(A,&da);
848: MatDenseCUDAGetArrayWrite(*B,&db);
849: if (a->lda > A->rmap->n) {
850: PetscInt j,m = A->rmap->n;
852: for (j=0; j<A->cmap->n; j++) { /* it can be done better */
853: cerr = cudaMemcpy(db+j*m,da+j*a->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
854: }
855: } else {
856: cerr = cudaMemcpy(db,da,a->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
857: }
858: MatDenseCUDARestoreArrayRead(A,&da);
859: MatDenseCUDARestoreArrayWrite(*B,&db);
860: (*B)->offloadmask = PETSC_OFFLOAD_BOTH;
861: }
862: return(0);
863: }
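/*
   Sketch (error checking omitted): duplicating with values stays on the GPU when the
   source data is current there, using the device-to-device memcpy path above rather
   than a host round trip:

     Mat B;
     MatDuplicate(A,MAT_COPY_VALUES,&B);
*/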
865: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A,MatFactorType ftype,Mat *fact)
866: {
870: MatCreate(PetscObjectComm((PetscObject)A),fact);
871: MatSetSizes(*fact,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
872: MatSetType(*fact,MATSEQDENSECUDA);
873: if (ftype == MAT_FACTOR_LU) {
874: (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
875: } else {
876: (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
877: }
878: (*fact)->factortype = ftype;
880: PetscFree((*fact)->solvertype);
881: PetscStrallocpy(MATSOLVERCUDA,&(*fact)->solvertype);
882: return(0);
883: }
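/*
   The factory above registers the MATSOLVERCUDA dense factorizations, so a GPU LU can
   be requested through the usual factor interface.  A sketch with assumed names
   (A is MATSEQDENSECUDA and square, b and x are compatible vectors; plain CPU vectors
   also work through the workvec path in MatSolve_SeqDenseCUDA_Private); error checking
   omitted:

     Mat           F;
     MatFactorInfo info;

     MatFactorInfoInitialize(&info);
     MatGetFactor(A,MATSOLVERCUDA,MAT_FACTOR_LU,&F);   // returns a MATSEQDENSECUDA factor matrix
     MatLUFactorSymbolic(F,A,NULL,NULL,&info);
     MatLUFactorNumeric(F,A,&info);                    // LU on the device (cusolverDnXgetrf)
     MatSolve(F,b,x);                                  // triangular solves on the device (cusolverDnXgetrs)
*/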
885: static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A,PetscBool flg)
886: {
890: A->boundtocpu = flg;
891: if (!flg) {
892: PetscObjectComposeFunction((PetscObject)A,"MatSeqDenseSetPreallocation_C",MatSeqDenseSetPreallocation_SeqDenseCUDA);
893: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C", MatDenseGetArray_SeqDenseCUDA);
894: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C", MatDenseGetArrayRead_SeqDenseCUDA);
895: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreArray_C", MatDenseRestoreArray_SeqDenseCUDA);
897: A->ops->duplicate = MatDuplicate_SeqDenseCUDA;
898: A->ops->mult = MatMult_SeqDenseCUDA;
899: A->ops->multadd = MatMultAdd_SeqDenseCUDA;
900: A->ops->multtranspose = MatMultTranspose_SeqDenseCUDA;
901: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDenseCUDA;
902: A->ops->matmultnumeric = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
903: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
904: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
905: A->ops->axpy = MatAXPY_SeqDenseCUDA;
906: A->ops->choleskyfactor = MatCholeskyFactor_SeqDenseCUDA;
907: A->ops->lufactor = MatLUFactor_SeqDenseCUDA;
908: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDenseCUDA;
909: } else {
910: /* make sure we have an up-to-date copy on the CPU */
911: MatSeqDenseCUDACopyFromGPU(A);
912: PetscObjectComposeFunction((PetscObject)A,"MatSeqDenseSetPreallocation_C",MatSeqDenseSetPreallocation_SeqDense);
913: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C", MatDenseGetArray_SeqDense);
914: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense);
915: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreArray_C", MatDenseRestoreArray_SeqDense);
917: A->ops->duplicate = MatDuplicate_SeqDense;
918: A->ops->mult = MatMult_SeqDense;
919: A->ops->multadd = MatMultAdd_SeqDense;
920: A->ops->multtranspose = MatMultTranspose_SeqDense;
921: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDense;
922: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
923: A->ops->matmultnumeric = MatMatMultNumeric_SeqDense_SeqDense;
924: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
925: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
926: A->ops->axpy = MatAXPY_SeqDense;
927: A->ops->choleskyfactor = MatCholeskyFactor_SeqDense;
928: A->ops->lufactor = MatLUFactor_SeqDense;
929: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
930: }
931: return(0);
932: }
934: PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M,MatType type,MatReuse reuse,Mat *newmat)
935: {
936: Mat B;
937: PetscErrorCode ierr;
940: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
941: /* TODO these cases should be optimized */
942: MatConvert_Basic(M,type,reuse,newmat);
943: return(0);
944: }
946: B = *newmat;
947: MatBindToCPU_SeqDenseCUDA(B,PETSC_TRUE);
948: MatReset_SeqDenseCUDA(B);
949: PetscFree(B->defaultvectype);
950: PetscStrallocpy(VECSTANDARD,&B->defaultvectype);
951: PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSE);
952: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",NULL);
954: B->ops->bindtocpu = NULL;
955: B->ops->destroy = MatDestroy_SeqDense;
956: B->offloadmask = PETSC_OFFLOAD_CPU;
957: return(0);
958: }
960: PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M,MatType type,MatReuse reuse,Mat *newmat)
961: {
962: Mat_SeqDenseCUDA *dB;
963: Mat B;
964: PetscErrorCode ierr;
967: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
968: /* TODO these cases should be optimized */
969: MatConvert_Basic(M,type,reuse,newmat);
970: return(0);
971: }
973: B = *newmat;
974: PetscFree(B->defaultvectype);
975: PetscStrallocpy(VECCUDA,&B->defaultvectype);
976: PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSECUDA);
977: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",MatConvert_SeqDenseCUDA_SeqDense);
979: MatReset_SeqDenseCUDA(B);
980: PetscNewLog(B,&dB);
981: B->spptr = dB;
983: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
985: MatBindToCPU_SeqDenseCUDA(B,PETSC_FALSE);
986: B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
987: B->ops->destroy = MatDestroy_SeqDenseCUDA;
988: return(0);
989: }
991: /*MC
992: MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.
994: Options Database Keys:
995: . -mat_type seqdensecuda - sets the matrix type to "seqdensecuda" during a call to MatSetFromOptions()
997: Level: beginner
999: .seealso: MatCreateSeqDenseCUDA()
1001: M*/
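/*
   Example (a sketch; m, n and the fill-in step are assumptions; error checking
   omitted): creating a sequential dense CUDA matrix by type.

     Mat A;
     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,m,n,m,n);
     MatSetType(A,MATSEQDENSECUDA);
     MatSeqDenseSetPreallocation(A,NULL);   // PETSc-allocated host storage; device buffer allocated too
     // ... fill with MatSetValues()/MatDenseGetArray() on the host, or MatDenseCUDAGetArrayWrite() on the device ...
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
*/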
1002: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
1003: {
1007: MatCreate_SeqDense(B);
1008: MatConvert_SeqDense_SeqDenseCUDA(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
1009: return(0);
1010: }