Actual source code: densecuda.cu
petsc-3.14.6 2021-03-30
1: /*
2: Defines the matrix operations for sequential dense with CUDA
3: */
4: #include <petscpkg_version.h>
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6: #include <../src/mat/impls/dense/seq/dense.h>
7: #include <petsccublas.h>
9: /* cublas definitions are here */
10: #include <petsc/private/cudavecimpl.h>
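/* The cusolverDnX* macros below map precision-agnostic names onto the
   precision-specific cuSOLVER routines (S/D/C/Z variants), selected at compile
   time from PETSc's scalar type; casts to cuComplex/cuDoubleComplex are added
   where PetscScalar is complex. */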
12: #if defined(PETSC_USE_COMPLEX)
13: #if defined(PETSC_USE_REAL_SINGLE)
14: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnCpotrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
15: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnCpotrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
16: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnCpotrs((a),(b),(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g),(h),(i))
17: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnCpotri((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
18: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnCpotri_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
19: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnCsytrf((a),(b),(c),(cuComplex*)(d),(e),(f),(cuComplex*)(g),(h),(i))
20: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnCsytrf_bufferSize((a),(b),(cuComplex*)(c),(d),(e))
21: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnCgetrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
22: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnCgetrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
23: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnCgetrs((a),(b),(c),(d),(cuComplex*)(e),(f),(g),(cuComplex*)(h),(i),(j))
24: #else /* complex double */
25: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnZpotrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
26: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnZpotrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
27: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnZpotrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g),(h),(i))
28: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnZpotri((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
29: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnZpotri_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
30: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnZsytrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(f),(cuDoubleComplex*)(g),(h),(i))
31: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnZsytrf_bufferSize((a),(b),(cuDoubleComplex*)(c),(d),(e))
32: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnZgetrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
33: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnZgetrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
34: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnZgetrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(g),(cuDoubleComplex*)(h),(i),(j))
35: #endif
36: #else /* real */
37: #if defined(PETSC_USE_REAL_SINGLE)
38: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnSpotrf((a),(b),(c),(d),(e),(f),(g),(h))
39: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnSpotrf_bufferSize((a),(b),(c),(d),(e),(f))
40: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnSpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
41: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnSpotri((a),(b),(c),(d),(e),(f),(g),(h))
42: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnSpotri_bufferSize((a),(b),(c),(d),(e),(f))
43: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnSsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
44: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnSsytrf_bufferSize((a),(b),(c),(d),(e))
45: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnSgetrf((a),(b),(c),(d),(e),(f),(g),(h))
46: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnSgetrf_bufferSize((a),(b),(c),(d),(e),(f))
47: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnSgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
48: #else /* real double */
49: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnDpotrf((a),(b),(c),(d),(e),(f),(g),(h))
50: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnDpotrf_bufferSize((a),(b),(c),(d),(e),(f))
51: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnDpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
52: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnDpotri((a),(b),(c),(d),(e),(f),(g),(h))
53: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnDpotri_bufferSize((a),(b),(c),(d),(e),(f))
54: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnDsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
55: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnDsytrf_bufferSize((a),(b),(c),(d),(e))
56: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnDgetrf((a),(b),(c),(d),(e),(f),(g),(h))
57: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnDgetrf_bufferSize((a),(b),(c),(d),(e),(f))
58: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnDgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
59: #endif
60: #endif
62: typedef struct {
63: PetscScalar *d_v; /* pointer to the matrix on the GPU */
64: PetscBool user_alloc;
65: PetscScalar *unplacedarray; /* if one called MatDenseCUDAPlaceArray(), this is where the original array is stashed */
66: PetscBool unplaced_user_alloc;
67: /* factorization support */
68: int *d_fact_ipiv; /* device pivots */
69: PetscScalar *d_fact_work; /* device workspace */
70: int fact_lwork;
71: int *d_fact_info; /* device info */
72: /* workspace */
73: Vec workvec;
74: } Mat_SeqDenseCUDA;
76: PetscErrorCode MatSeqDenseCUDASetPreallocation(Mat A, PetscScalar *d_data)
77: {
78: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
79: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
80: PetscErrorCode ierr;
81: PetscBool iscuda;
82: cudaError_t cerr;
85: PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&iscuda);
86: if (!iscuda) return(0);
87: /* the CPU preallocation may not have been performed yet */
88: PetscLayoutSetUp(A->rmap);
89: PetscLayoutSetUp(A->cmap);
90: if (cA->lda <= 0) cA->lda = A->rmap->n;
91: if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
92: if (!d_data) { /* petsc-allocated storage */
93: PetscIntMultError(cA->lda,A->cmap->n,NULL);
94: cerr = cudaMalloc((void**)&dA->d_v,cA->lda*A->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
95: dA->user_alloc = PETSC_FALSE;
96: } else { /* user-allocated storage */
97: dA->d_v = d_data;
98: dA->user_alloc = PETSC_TRUE;
99: A->offloadmask = PETSC_OFFLOAD_GPU;
100: }
101: A->preallocated = PETSC_TRUE;
102: A->assembled = PETSC_TRUE;
103: return(0);
104: }
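/* A caller can hand the matrix an existing device buffer instead of letting
   PETSc allocate one. A minimal sketch (the names d_buf, m, n, A are illustrative;
   error checking omitted):

     PetscScalar *d_buf;
     cudaMalloc((void**)&d_buf,(size_t)m*n*sizeof(PetscScalar));
     MatSeqDenseCUDASetPreallocation(A,d_buf);  // A is marked assembled, data lives on the GPU

   Passing NULL instead makes PETSc allocate lda*n entries itself. */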
106: PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
107: {
108: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
109: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
110: PetscErrorCode ierr;
111: cudaError_t cerr;
115: PetscInfo3(A,"%s matrix %d x %d\n",A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
116: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
117: if (!cA->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
118: MatSeqDenseSetPreallocation(A,NULL);
119: }
120: PetscLogEventBegin(MAT_DenseCopyFromGPU,A,0,0,0);
121: if (cA->lda > A->rmap->n) {
122: PetscInt j,m = A->rmap->n;
124: for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
125: cerr = cudaMemcpy(cA->v + j*cA->lda,dA->d_v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
126: }
127: } else {
128: cerr = cudaMemcpy(cA->v,dA->d_v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
129: }
130: PetscLogGpuToCpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
131: PetscLogEventEnd(MAT_DenseCopyFromGPU,A,0,0,0);
133: A->offloadmask = PETSC_OFFLOAD_BOTH;
134: }
135: return(0);
136: }
138: PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
139: {
140: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
141: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
142: PetscBool copy;
143: PetscErrorCode ierr;
144: cudaError_t cerr;
148: if (A->boundtocpu) return(0);
149: copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
150: PetscInfo3(A,"%s matrix %d x %d\n",copy ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
151: if (copy) {
152: if (!dA->d_v) { /* Allocate GPU memory if not present */
153: MatSeqDenseCUDASetPreallocation(A,NULL);
154: }
155: PetscLogEventBegin(MAT_DenseCopyToGPU,A,0,0,0);
156: if (cA->lda > A->rmap->n) {
157: PetscInt j,m = A->rmap->n;
159: for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
160: cerr = cudaMemcpy(dA->d_v + j*cA->lda,cA->v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
161: }
162: } else {
163: cerr = cudaMemcpy(dA->d_v,cA->v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
164: }
165: PetscLogCpuToGpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
166: PetscLogEventEnd(MAT_DenseCopyToGPU,A,0,0,0);
168: A->offloadmask = PETSC_OFFLOAD_BOTH;
169: }
170: return(0);
171: }
173: static PetscErrorCode MatDenseCUDAPlaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
174: {
175: Mat_SeqDense *aa = (Mat_SeqDense*)A->data;
176: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
177: PetscErrorCode ierr;
180: if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
181: if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
182: if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
183: if (aa->v) { MatSeqDenseCUDACopyToGPU(A); }
184: dA->unplacedarray = dA->d_v;
185: dA->unplaced_user_alloc = dA->user_alloc;
186: dA->d_v = (PetscScalar*)a;
187: dA->user_alloc = PETSC_TRUE;
188: return(0);
189: }
191: static PetscErrorCode MatDenseCUDAResetArray_SeqDenseCUDA(Mat A)
192: {
193: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
194: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
195: PetscErrorCode ierr;
198: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
199: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
200: if (a->v) { MatSeqDenseCUDACopyToGPU(A); }
201: dA->d_v = dA->unplacedarray;
202: dA->user_alloc = dA->unplaced_user_alloc;
203: dA->unplacedarray = NULL;
204: return(0);
205: }
207: static PetscErrorCode MatDenseCUDAReplaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
208: {
209: Mat_SeqDense *aa = (Mat_SeqDense*)A->data;
210: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
211: cudaError_t cerr;
214: if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
215: if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
216: if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
217: if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
218: dA->d_v = (PetscScalar*)a;
219: dA->user_alloc = PETSC_FALSE;
220: return(0);
221: }
223: static PetscErrorCode MatDenseCUDAGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
224: {
225: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
226: PetscErrorCode ierr;
229: if (!dA->d_v) {
230: MatSeqDenseCUDASetPreallocation(A,NULL);
231: }
232: *a = dA->d_v;
233: return(0);
234: }
236: static PetscErrorCode MatDenseCUDARestoreArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
237: {
239: *a = NULL;
240: return(0);
241: }
243: static PetscErrorCode MatDenseCUDAGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
244: {
245: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
246: PetscErrorCode ierr;
249: MatSeqDenseCUDACopyToGPU(A);
250: *a = dA->d_v;
251: return(0);
252: }
254: static PetscErrorCode MatDenseCUDARestoreArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
255: {
257: *a = NULL;
258: return(0);
259: }
261: static PetscErrorCode MatDenseCUDAGetArray_SeqDenseCUDA(Mat A, PetscScalar **a)
262: {
263: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
264: PetscErrorCode ierr;
267: MatSeqDenseCUDACopyToGPU(A);
268: *a = dA->d_v;
269: return(0);
270: }
272: static PetscErrorCode MatDenseCUDARestoreArray_SeqDenseCUDA(Mat A, PetscScalar **a)
273: {
275: *a = NULL;
276: return(0);
277: }
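/* The three access modes above back the public MatDenseCUDAGetArray{,Read,Write}()
   interface: the Read variant copies host data to the device when needed, the Write
   variant only makes sure device storage exists, and the plain variant behaves like
   Read but returns a writable pointer. A minimal caller-side sketch (error checking
   omitted):

     PetscScalar *d;
     MatDenseCUDAGetArray(A,&d);       // device pointer; leading dimension from MatDenseGetLDA()
     // ... use d in a kernel or a cuBLAS call ...
     MatDenseCUDARestoreArray(A,&d);   // d is reset to NULL
*/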
279: PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
280: {
281: #if PETSC_PKG_CUDA_VERSION_GE(10,1,0)
282: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
283: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
284: PetscScalar *da;
285: PetscErrorCode ierr;
286: cudaError_t ccer;
287: cusolverStatus_t cerr;
288: cusolverDnHandle_t handle;
289: int n,lda;
290: #if defined(PETSC_USE_DEBUG)
291: int info;
292: #endif
295: if (!A->rmap->n || !A->cmap->n) return(0);
296: PetscCUSOLVERDnGetHandle(&handle);
297: PetscMPIIntCast(A->cmap->n,&n);
298: PetscMPIIntCast(a->lda,&lda);
299: if (A->factortype == MAT_FACTOR_LU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDngetri not implemented");
300: else if (A->factortype == MAT_FACTOR_CHOLESKY) {
301: if (!dA->d_fact_ipiv) { /* spd */
302: int il;
304: MatDenseCUDAGetArray(A,&da);
305: cerr = cusolverDnXpotri_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&il);CHKERRCUSOLVER(cerr);
306: if (il > dA->fact_lwork) {
307: dA->fact_lwork = il;
309: ccer = cudaFree(dA->d_fact_work);CHKERRCUDA(ccer);
310: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
311: }
312: PetscLogGpuTimeBegin();
313: cerr = cusolverDnXpotri(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
314: ccer = WaitForCUDA();CHKERRCUDA(ccer);
315: PetscLogGpuTimeEnd();
316: MatDenseCUDARestoreArray(A,&da);
317: /* TODO (write cuda kernel) */
318: MatSeqDenseSymmetrize_Private(A,PETSC_TRUE);
319: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytri not implemented");
320: }
321: #if defined(PETSC_USE_DEBUG)
322: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
323: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: leading minor of order %d is zero",info);
324: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
325: #endif
326: PetscLogGpuFlops(1.0*n*n*n/3.0);
327: A->ops->solve = NULL;
328: A->ops->solvetranspose = NULL;
329: A->ops->matsolve = NULL;
330: A->factortype = MAT_FACTOR_NONE;
332: PetscFree(A->solvertype);
333: return(0);
334: #else
335: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Upgrade to CUDA version 10.1.0 or higher");
336: #endif
337: }
339: static PetscErrorCode MatMatSolve_SeqDenseCUDA(Mat A,Mat B,Mat X)
340: {
341: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
342: Mat_SeqDense *x = (Mat_SeqDense*)X->data;
343: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
344: const PetscScalar *da;
345: PetscScalar *dx;
346: cusolverDnHandle_t handle;
347: PetscBool iscuda;
348: int nrhs,n,lda,ldx;
349: #if defined(PETSC_USE_DEBUG)
350: int info;
351: #endif
352: cudaError_t ccer;
353: cusolverStatus_t cerr;
354: PetscErrorCode ierr;
357: if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
358: if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
359: PetscObjectTypeCompareAny((PetscObject)X,&iscuda,VECSEQCUDA,VECMPICUDA,"");
360: if (X != B) {
361: MatCopy(B,X,SAME_NONZERO_PATTERN);
362: }
363: MatDenseCUDAGetArrayRead(A,&da);
364: /* MatMatSolve does not have a dispatching mechanism, we may end up with a MATSEQDENSE here */
365: PetscObjectTypeCompare((PetscObject)X,MATSEQDENSECUDA,&iscuda);
366: if (!iscuda) {
367: MatConvert(X,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&X);
368: }
369: MatDenseCUDAGetArray(X,&dx);
370: PetscMPIIntCast(A->rmap->n,&n);
371: PetscMPIIntCast(X->cmap->n,&nrhs);
372: PetscMPIIntCast(a->lda,&lda);
373: PetscMPIIntCast(x->lda,&ldx);
374: PetscCUSOLVERDnGetHandle(&handle);
375: PetscLogGpuTimeBegin();
376: if (A->factortype == MAT_FACTOR_LU) {
377: PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
378: cerr = cusolverDnXgetrs(handle,CUBLAS_OP_N,n,nrhs,da,lda,dA->d_fact_ipiv,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
379: } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
380: PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
381: if (!dA->d_fact_ipiv) { /* spd */
382: /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
383: cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,nrhs,da,lda,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
384: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
385: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
386: ccer = WaitForCUDA();CHKERRCUDA(ccer);
387: PetscLogGpuTimeEnd();
388: MatDenseCUDARestoreArrayRead(A,&da);
389: MatDenseCUDARestoreArray(X,&dx);
390: if (!iscuda) {
391: MatConvert(X,MATSEQDENSE,MAT_INPLACE_MATRIX,&X);
392: }
393: #if defined(PETSC_USE_DEBUG)
394: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
395: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
396: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
397: #endif
398: PetscLogGpuFlops(nrhs*(2.0*n*n - n));
399: return(0);
400: }
402: static PetscErrorCode MatSolve_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,PetscBool trans)
403: {
404: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
405: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
406: const PetscScalar *da;
407: PetscScalar *y;
408: cusolverDnHandle_t handle;
409: int one = 1,n,lda;
410: #if defined(PETSC_USE_DEBUG)
411: int info;
412: #endif
413: cudaError_t ccer;
414: cusolverStatus_t cerr;
415: PetscBool iscuda;
416: PetscErrorCode ierr;
419: if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
420: if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
421: PetscMPIIntCast(A->rmap->n,&n);
422: /* MatSolve does not have a dispatching mechanism, we may end up with a VECSTANDARD here */
423: PetscObjectTypeCompareAny((PetscObject)yy,&iscuda,VECSEQCUDA,VECMPICUDA,"");
424: if (iscuda) {
425: VecCopy(xx,yy);
426: VecCUDAGetArray(yy,&y);
427: } else {
428: if (!dA->workvec) {
429: MatCreateVecs(A,&dA->workvec,NULL);
430: }
431: VecCopy(xx,dA->workvec);
432: VecCUDAGetArray(dA->workvec,&y);
433: }
434: MatDenseCUDAGetArrayRead(A,&da);
435: PetscMPIIntCast(a->lda,&lda);
436: PetscCUSOLVERDnGetHandle(&handle);
437: PetscLogGpuTimeBegin();
438: if (A->factortype == MAT_FACTOR_LU) {
439: PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
440: cerr = cusolverDnXgetrs(handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,n,one,da,lda,dA->d_fact_ipiv,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
441: } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
442: PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
443: if (!dA->d_fact_ipiv) { /* spd */
444: /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
445: cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,one,da,lda,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
446: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
447: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
448: ccer = WaitForCUDA();CHKERRCUDA(ccer);
449: PetscLogGpuTimeEnd();
450: if (iscuda) {
451: VecCUDARestoreArray(yy,&y);
452: } else {
453: VecCUDARestoreArray(dA->workvec,&y);
454: VecCopy(dA->workvec,yy);
455: }
456: MatDenseCUDARestoreArrayRead(A,&da);
457: #if defined(PETSC_USE_DEBUG)
458: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
459: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
460: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
461: #endif
462: PetscLogGpuFlops(2.0*n*n - n);
463: return(0);
464: }
466: static PetscErrorCode MatSolve_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
467: {
468: PetscErrorCode ierr;
471: MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_FALSE);
472: return(0);
473: }
475: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
476: {
477: PetscErrorCode ierr;
480: MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_TRUE);
481: return(0);
482: }
484: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A,IS rperm,IS cperm,const MatFactorInfo *factinfo)
485: {
486: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
487: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
488: PetscScalar *da;
489: int m,n,lda;
490: #if defined(PETSC_USE_DEBUG)
491: int info;
492: #endif
493: cusolverStatus_t cerr;
494: cusolverDnHandle_t handle;
495: cudaError_t ccer;
496: PetscErrorCode ierr;
499: if (!A->rmap->n || !A->cmap->n) return(0);
500: PetscCUSOLVERDnGetHandle(&handle);
501: MatDenseCUDAGetArray(A,&da);
502: PetscMPIIntCast(A->cmap->n,&n);
503: PetscMPIIntCast(A->rmap->n,&m);
504: PetscMPIIntCast(a->lda,&lda);
505: PetscInfo2(A,"LU factor %d x %d on backend\n",m,n);
506: if (!dA->d_fact_ipiv) {
507: ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
508: }
509: if (!dA->fact_lwork) {
510: cerr = cusolverDnXgetrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
511: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
512: }
513: if (!dA->d_fact_info) {
514: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
515: }
516: PetscLogGpuTimeBegin();
517: cerr = cusolverDnXgetrf(handle,m,n,da,lda,dA->d_fact_work,dA->d_fact_ipiv,dA->d_fact_info);CHKERRCUSOLVER(cerr);
518: ccer = WaitForCUDA();CHKERRCUDA(ccer);
519: PetscLogGpuTimeEnd();
520: MatDenseCUDARestoreArray(A,&da);
521: #if defined(PETSC_USE_DEBUG)
522: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
523: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
524: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
525: #endif
526: A->factortype = MAT_FACTOR_LU;
527: PetscLogGpuFlops(2.0*n*n*m/3.0);
529: A->ops->solve = MatSolve_SeqDenseCUDA;
530: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
531: A->ops->matsolve = MatMatSolve_SeqDenseCUDA;
533: PetscFree(A->solvertype);
534: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
535: return(0);
536: }
538: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A,IS perm,const MatFactorInfo *factinfo)
539: {
540: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
541: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
542: PetscScalar *da;
543: int n,lda;
544: #if defined(PETSC_USE_DEBUG)
545: int info;
546: #endif
547: cusolverStatus_t cerr;
548: cusolverDnHandle_t handle;
549: cudaError_t ccer;
550: PetscErrorCode ierr;
553: if (!A->rmap->n || !A->cmap->n) return(0);
554: PetscCUSOLVERDnGetHandle(&handle);
555: PetscMPIIntCast(A->rmap->n,&n);
556: PetscInfo2(A,"Cholesky factor %d x %d on backend\n",n,n);
557: if (A->spd) {
558: MatDenseCUDAGetArray(A,&da);
559: PetscMPIIntCast(a->lda,&lda);
560: if (!dA->fact_lwork) {
561: cerr = cusolverDnXpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
562: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
563: }
564: if (!dA->d_fact_info) {
565: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
566: }
567: PetscLogGpuTimeBegin();
568: cerr = cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
569: ccer = WaitForCUDA();CHKERRCUDA(ccer);
570: PetscLogGpuTimeEnd();
572: MatDenseCUDARestoreArray(A,&da);
573: #if defined(PETSC_USE_DEBUG)
574: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
575: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
576: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
577: #endif
578: A->factortype = MAT_FACTOR_CHOLESKY;
579: PetscLogGpuFlops(1.0*n*n*n/3.0);
580: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
581: #if 0
582: /* At the time this interface was written (CUDA 10.0), cusolverDn did not implement the *sytrs and *hetr* routines.
583: The code below should work and can be activated once the *sytrs routines become available. */
584: if (!dA->d_fact_ipiv) {
585: ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
586: }
587: if (!dA->fact_lwork) {
588: cerr = cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
589: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
590: }
591: if (!dA->d_fact_info) {
592: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
593: }
594: PetscLogGpuTimeBegin();
595: cerr = cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
596: PetscLogGpuTimeEnd();
597: #endif
599: A->ops->solve = MatSolve_SeqDenseCUDA;
600: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
601: A->ops->matsolve = MatMatSolve_SeqDenseCUDA;
603: PetscFree(A->solvertype);
604: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
605: return(0);
606: }
608: /* GEMM: C = op(A)*op(B); tA and tB indicate whether A and B are transposed */
609: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A,Mat B,Mat C,PetscBool tA,PetscBool tB)
610: {
611: const PetscScalar *da,*db;
612: PetscScalar *dc;
613: PetscScalar one=1.0,zero=0.0;
614: int m,n,k;
615: PetscInt alda,blda,clda;
616: PetscErrorCode ierr;
617: cublasHandle_t cublasv2handle;
618: PetscBool Aiscuda,Biscuda;
619: cublasStatus_t berr;
620: cudaError_t cerr;
623: /* we may end up with SEQDENSE as one of the arguments */
624: PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&Aiscuda);
625: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&Biscuda);
626: if (!Aiscuda) {
627: MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
628: }
629: if (!Biscuda) {
630: MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
631: }
632: PetscMPIIntCast(C->rmap->n,&m);
633: PetscMPIIntCast(C->cmap->n,&n);
634: if (tA) {
635: PetscMPIIntCast(A->rmap->n,&k);
636: } else {
637: PetscMPIIntCast(A->cmap->n,&k);
638: }
639: if (!m || !n || !k) return(0);
640: PetscInfo3(C,"Matrix-Matrix product %d x %d x %d on backend\n",m,k,n);
641: MatDenseCUDAGetArrayRead(A,&da);
642: MatDenseCUDAGetArrayRead(B,&db);
643: MatDenseCUDAGetArrayWrite(C,&dc);
644: MatDenseGetLDA(A,&alda);
645: MatDenseGetLDA(B,&blda);
646: MatDenseGetLDA(C,&clda);
647: PetscCUBLASGetHandle(&cublasv2handle);
648: PetscLogGpuTimeBegin();
649: berr = cublasXgemm(cublasv2handle,tA ? CUBLAS_OP_T : CUBLAS_OP_N,tB ? CUBLAS_OP_T : CUBLAS_OP_N,
650: m,n,k,&one,da,alda,db,blda,&zero,dc,clda);CHKERRCUBLAS(berr);
651: cerr = WaitForCUDA();CHKERRCUDA(cerr);
652: PetscLogGpuTimeEnd();
653: PetscLogGpuFlops(1.0*m*n*k + 1.0*m*n*(k-1));
654: MatDenseCUDARestoreArrayRead(A,&da);
655: MatDenseCUDARestoreArrayRead(B,&db);
656: MatDenseCUDARestoreArrayWrite(C,&dc);
657: if (!Aiscuda) {
658: MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
659: }
660: if (!Biscuda) {
661: MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
662: }
663: return(0);
664: }
666: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
667: {
671: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_TRUE,PETSC_FALSE);
672: return(0);
673: }
675: PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
676: {
680: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_FALSE);
681: return(0);
682: }
684: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
685: {
689: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_TRUE);
690: return(0);
691: }
693: PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
694: {
698: MatProductSetFromOptions_SeqDense(C);
699: return(0);
700: }
702: /* zz = op(A)*xx + yy
703: if yy == NULL, only MatMult */
704: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans)
705: {
706: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
707: const PetscScalar *xarray,*da;
708: PetscScalar *zarray;
709: PetscScalar one=1.0,zero=0.0;
710: int m, n, lda; /* plain int: cuBLAS takes int, and PetscMPIInt is typedef'ed to int, so PetscMPIIntCast() can be used for the conversions */
711: cublasHandle_t cublasv2handle;
712: cublasStatus_t berr;
713: PetscErrorCode ierr;
716: if (yy && yy != zz) { /* mult add */
717: VecCopy_SeqCUDA(yy,zz);
718: }
719: if (!A->rmap->n || !A->cmap->n) {
720: if (!yy) { /* mult only */
721: VecSet_SeqCUDA(zz,0.0);
722: }
723: return(0);
724: }
725: PetscInfo2(A,"Matrix-vector product %d x %d on backend\n",A->rmap->n,A->cmap->n);
726: PetscMPIIntCast(A->rmap->n,&m);
727: PetscMPIIntCast(A->cmap->n,&n);
728: PetscMPIIntCast(mat->lda,&lda);
729: PetscCUBLASGetHandle(&cublasv2handle);
730: MatDenseCUDAGetArrayRead(A,&da);
731: VecCUDAGetArrayRead(xx,&xarray);
732: VecCUDAGetArray(zz,&zarray);
733: PetscLogGpuTimeBegin();
734: berr = cublasXgemv(cublasv2handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,
735: m,n,&one,da,lda,xarray,1,(yy ? &one : &zero),zarray,1);CHKERRCUBLAS(berr);
736: PetscLogGpuTimeEnd();
737: PetscLogGpuFlops(2.0*A->rmap->n*A->cmap->n - (yy ? 0 : A->rmap->n));
738: VecCUDARestoreArrayRead(xx,&xarray);
739: VecCUDARestoreArray(zz,&zarray);
740: MatDenseCUDARestoreArrayRead(A,&da);
741: return(0);
742: }
744: PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
745: {
749: MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_FALSE);
750: return(0);
751: }
753: PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
754: {
758: MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_TRUE);
759: return(0);
760: }
762: PetscErrorCode MatMult_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
763: {
767: MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_FALSE);
768: return(0);
769: }
771: PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
772: {
776: MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_TRUE);
777: return(0);
778: }
780: static PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A,const PetscScalar **array)
781: {
782: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
786: MatSeqDenseCUDACopyFromGPU(A);
787: *array = mat->v;
788: return(0);
789: }
791: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseCUDA(Mat A,PetscScalar **array)
792: {
793: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
797: if (!mat->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
798: MatSeqDenseSetPreallocation(A,NULL);
799: }
800: *array = mat->v;
801: A->offloadmask = PETSC_OFFLOAD_CPU;
802: return(0);
803: }
805: static PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A,PetscScalar **array)
806: {
807: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
811: MatSeqDenseCUDACopyFromGPU(A);
812: *array = mat->v;
813: A->offloadmask = PETSC_OFFLOAD_CPU;
814: return(0);
815: }
817: PetscErrorCode MatScale_SeqDenseCUDA(Mat Y,PetscScalar alpha)
818: {
819: Mat_SeqDense *y = (Mat_SeqDense*)Y->data;
820: PetscScalar *dy;
821: int j,N,m,lday,one = 1;
822: cublasHandle_t cublasv2handle;
823: cublasStatus_t berr;
825: cudaError_t cerr;
828: PetscCUBLASGetHandle(&cublasv2handle);
829: MatDenseCUDAGetArray(Y,&dy);
830: PetscMPIIntCast(Y->rmap->n*Y->cmap->n,&N);
831: PetscMPIIntCast(Y->rmap->n,&m);
832: PetscMPIIntCast(y->lda,&lday);
833: PetscInfo2(Y,"Performing Scale %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
834: PetscLogGpuTimeBegin();
835: if (lday>m) {
836: for (j=0; j<Y->cmap->n; j++) {
837: berr = cublasXscal(cublasv2handle,m,&alpha,dy+lday*j,one);CHKERRCUBLAS(berr);
838: }
839: } else {
840: berr = cublasXscal(cublasv2handle,N,&alpha,dy,one);CHKERRCUBLAS(berr);
841: }
842: cerr = WaitForCUDA();CHKERRCUDA(cerr);
843: PetscLogGpuTimeEnd();
844: PetscLogGpuFlops(N);
845: MatDenseCUDARestoreArray(Y,&dy);
846: return(0);
847: }
849: PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y,PetscScalar alpha,Mat X,MatStructure str)
850: {
851: Mat_SeqDense *x = (Mat_SeqDense*)X->data;
852: Mat_SeqDense *y = (Mat_SeqDense*)Y->data;
853: const PetscScalar *dx;
854: PetscScalar *dy;
855: int j,N,m,ldax,lday,one = 1;
856: cublasHandle_t cublasv2handle;
857: cublasStatus_t berr;
858: PetscErrorCode ierr;
859: cudaError_t cerr;
862: if (!X->rmap->n || !X->cmap->n) return(0);
863: PetscCUBLASGetHandle(&cublasv2handle);
864: MatDenseCUDAGetArrayRead(X,&dx);
865: if (alpha != 0.0) {
866: MatDenseCUDAGetArray(Y,&dy);
867: } else {
868: MatDenseCUDAGetArrayWrite(Y,&dy);
869: }
870: PetscMPIIntCast(X->rmap->n*X->cmap->n,&N);
871: PetscMPIIntCast(X->rmap->n,&m);
872: PetscMPIIntCast(x->lda,&ldax);
873: PetscMPIIntCast(y->lda,&lday);
874: PetscInfo2(Y,"Performing AXPY %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
875: PetscLogGpuTimeBegin();
876: if (ldax>m || lday>m) {
877: for (j=0; j<X->cmap->n; j++) {
878: berr = cublasXaxpy(cublasv2handle,m,&alpha,dx+j*ldax,one,dy+j*lday,one);CHKERRCUBLAS(berr);
879: }
880: } else {
881: berr = cublasXaxpy(cublasv2handle,N,&alpha,dx,one,dy,one);CHKERRCUBLAS(berr);
882: }
883: cerr = WaitForCUDA();CHKERRCUDA(cerr);
884: PetscLogGpuTimeEnd();
885: PetscLogGpuFlops(PetscMax(2.*N-1,0));
886: MatDenseCUDARestoreArrayRead(X,&dx);
887: if (alpha != 0.0) {
888: MatDenseCUDARestoreArray(Y,&dy);
889: } else {
890: MatDenseCUDARestoreArrayWrite(Y,&dy);
891: }
892: return(0);
893: }
895: static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
896: {
897: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
898: cudaError_t cerr;
899: PetscErrorCode ierr;
902: if (dA) {
903: if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
904: if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
905: cerr = cudaFree(dA->d_fact_ipiv);CHKERRCUDA(cerr);
906: cerr = cudaFree(dA->d_fact_info);CHKERRCUDA(cerr);
907: cerr = cudaFree(dA->d_fact_work);CHKERRCUDA(cerr);
908: VecDestroy(&dA->workvec);
909: }
910: PetscFree(A->spptr);
911: return(0);
912: }
914: PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
915: {
916: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
920: /* avoid copying the data back to the CPU if we own the data pointer */
921: if (!a->user_alloc) { A->offloadmask = PETSC_OFFLOAD_CPU; }
922: MatConvert_SeqDenseCUDA_SeqDense(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
923: MatDestroy_SeqDense(A);
924: return(0);
925: }
927: PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A,MatDuplicateOption cpvalues,Mat *B)
928: {
932: MatCreate(PetscObjectComm((PetscObject)A),B);
933: MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
934: MatSetType(*B,((PetscObject)A)->type_name);
935: MatDuplicateNoCreate_SeqDense(*B,A,cpvalues);
936: if (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) {
937: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
938: const PetscScalar *da;
939: PetscScalar *db;
940: cudaError_t cerr;
941: PetscInt ldb;
943: MatDenseCUDAGetArrayRead(A,&da);
944: MatDenseCUDAGetArrayWrite(*B,&db);
945: MatDenseGetLDA(*B,&ldb);
946: if (a->lda > A->rmap->n || ldb > A->rmap->n) {
947: PetscInt j,m = A->rmap->n;
949: for (j=0; j<A->cmap->n; j++) { /* it can be done better */
950: cerr = cudaMemcpy(db+j*ldb,da+j*a->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
951: }
952: } else {
953: cerr = cudaMemcpy(db,da,(sizeof(PetscScalar)*A->cmap->n)*A->rmap->n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
954: }
955: MatDenseCUDARestoreArrayRead(A,&da);
956: MatDenseCUDARestoreArrayWrite(*B,&db);
957: (*B)->offloadmask = PETSC_OFFLOAD_BOTH;
958: }
959: return(0);
960: }
962: #include <petsc/private/vecimpl.h>
964: static PetscErrorCode MatGetColumnVector_SeqDenseCUDA(Mat A,Vec v,PetscInt col)
965: {
966: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
967: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
968: PetscErrorCode ierr;
969: PetscScalar *x;
970: PetscBool viscuda;
971: cudaError_t cerr;
974: PetscObjectTypeCompareAny((PetscObject)v,&viscuda,VECSEQCUDA,VECMPICUDA,VECCUDA,"");
975: if (viscuda && !v->boundtocpu) { /* update device data */
976: VecCUDAGetArrayWrite(v,&x);
977: if (A->offloadmask & PETSC_OFFLOAD_GPU) {
978: cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr); /* both pointers are device memory */
979: } else {
980: cerr = cudaMemcpy(x,a->v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
981: }
982: VecCUDARestoreArrayWrite(v,&x);
983: } else { /* update host data */
984: VecGetArrayWrite(v,&x);
985: if (A->offloadmask & PETSC_OFFLOAD_CPU) {
986: PetscArraycpy(x,a->v+col*a->lda,A->rmap->n);
987: } else {
988: cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
989: }
990: VecRestoreArrayWrite(v,&x);
991: }
992: return(0);
993: }
995: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A,MatFactorType ftype,Mat *fact)
996: {
1000: MatCreate(PetscObjectComm((PetscObject)A),fact);
1001: MatSetSizes(*fact,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
1002: MatSetType(*fact,MATSEQDENSECUDA);
1003: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
1004: (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1005: } else {
1006: (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1007: }
1008: (*fact)->factortype = ftype;
1010: PetscFree((*fact)->solvertype);
1011: PetscStrallocpy(MATSOLVERCUDA,&(*fact)->solvertype);
1012: return(0);
1013: }
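/* These factorization routines are usually reached through the MATSOLVERCUDA
   solver package registered above, e.g. from the options database (a sketch;
   the option names assume a KSP-based solve):

     -mat_type seqdensecuda -ksp_type preonly -pc_type lu -pc_factor_mat_solver_type cuda
*/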
1015: static PetscErrorCode MatDenseGetColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1016: {
1017: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1021: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1022: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1023: if (!a->cvec) {
1024: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,NULL,&a->cvec);
1025: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1026: }
1027: a->vecinuse = col + 1;
1028: MatDenseCUDAGetArray(A,(PetscScalar**)&a->ptrinuse);
1029: VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1030: *v = a->cvec;
1031: return(0);
1032: }
1034: static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1035: {
1036: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1040: if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1041: if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1042: a->vecinuse = 0;
1043: MatDenseCUDARestoreArray(A,(PetscScalar**)&a->ptrinuse);
1044: VecCUDAResetArray(a->cvec);
1045: *v = NULL;
1046: return(0);
1047: }
1049: static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1050: {
1051: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1055: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1056: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1057: if (!a->cvec) {
1058: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,NULL,&a->cvec);
1059: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1060: }
1061: a->vecinuse = col + 1;
1062: MatDenseCUDAGetArrayRead(A,&a->ptrinuse);
1063: VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1064: VecLockReadPush(a->cvec);
1065: *v = a->cvec;
1066: return(0);
1067: }
1069: static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1070: {
1071: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1075: if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1076: if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1077: a->vecinuse = 0;
1078: MatDenseCUDARestoreArrayRead(A,&a->ptrinuse);
1079: VecLockReadPop(a->cvec);
1080: VecCUDAResetArray(a->cvec);
1081: *v = NULL;
1082: return(0);
1083: }
1085: static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1086: {
1087: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1091: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1092: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1093: if (!a->cvec) {
1094: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,NULL,&a->cvec);
1095: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1096: }
1097: a->vecinuse = col + 1;
1098: MatDenseCUDAGetArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1099: VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1100: *v = a->cvec;
1101: return(0);
1102: }
1104: static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1105: {
1106: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1110: if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1111: if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1112: a->vecinuse = 0;
1113: MatDenseCUDARestoreArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1114: VecCUDAResetArray(a->cvec);
1115: *v = NULL;
1116: return(0);
1117: }
1119: static PetscErrorCode MatDenseGetSubMatrix_SeqDenseCUDA(Mat A,PetscInt cbegin,PetscInt cend,Mat *v)
1120: {
1121: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1122: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1123: PetscErrorCode ierr;
1126: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1127: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1128: if (a->cmat && cend-cbegin != a->cmat->cmap->N) {
1129: MatDestroy(&a->cmat);
1130: }
1131: MatSeqDenseCUDACopyToGPU(A);
1132: if (!a->cmat) {
1133: MatCreateDenseCUDA(PetscObjectComm((PetscObject)A),A->rmap->n,PETSC_DECIDE,A->rmap->N,cend-cbegin,dA->d_v + (size_t)cbegin * (size_t)a->lda,&a->cmat);
1134: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cmat);
1135: } else {
1136: MatDenseCUDAPlaceArray(a->cmat,dA->d_v + (size_t)cbegin * (size_t)a->lda);
1137: }
1138: MatDenseSetLDA(a->cmat,a->lda);
1139: if (a->v) { MatDensePlaceArray(a->cmat,a->v + (size_t)cbegin * (size_t)a->lda); }
1140: a->cmat->offloadmask = A->offloadmask;
1141: a->matinuse = cbegin + 1;
1142: *v = a->cmat;
1143: return(0);
1144: }
1146: static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseCUDA(Mat A,Mat *v)
1147: {
1148: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1152: if (!a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetSubMatrix() first");
1153: if (!a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column matrix");
1154: if (*v != a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Not the matrix obtained from MatDenseGetSubMatrix()");
1155: a->matinuse = 0;
1156: A->offloadmask = PETSC_OFFLOAD_GPU;
1157: MatDenseCUDAResetArray(a->cmat);
1158: MatDenseResetArray(a->cmat);
1159: *v = NULL;
1160: return(0);
1161: }
1163: static PetscErrorCode MatDenseSetLDA_SeqDenseCUDA(Mat A,PetscInt lda)
1164: {
1165: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
1166: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1167: PetscBool data;
1170: data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
1171: if (!dA->user_alloc && data && cA->lda!=lda) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"LDA cannot be changed after allocation of internal storage");
1172: if (lda < A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"LDA %D must be at least matrix dimension %D",lda,A->rmap->n);
1173: cA->lda = lda;
1174: return(0);
1175: }
1177: static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A,PetscBool flg)
1178: {
1179: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1183: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1184: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1185: A->boundtocpu = flg;
1186: if (!flg) {
1187: PetscBool iscuda;
1189: PetscObjectTypeCompare((PetscObject)a->cvec,VECSEQCUDA,&iscuda);
1190: if (!iscuda) {
1191: VecDestroy(&a->cvec);
1192: }
1193: PetscObjectTypeCompare((PetscObject)a->cmat,MATSEQDENSECUDA,&iscuda);
1194: if (!iscuda) {
1195: MatDestroy(&a->cmat);
1196: }
1197: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDenseCUDA);
1198: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArrayRead_SeqDenseCUDA);
1199: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArrayWrite_SeqDenseCUDA);
1200: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDenseCUDA);
1201: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDenseCUDA);
1202: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDenseCUDA);
1203: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDenseCUDA);
1204: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDenseCUDA);
1205: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDenseCUDA);
1206: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDenseCUDA);
1207: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDenseCUDA);
1208: PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDenseCUDA);
1210: A->ops->duplicate = MatDuplicate_SeqDenseCUDA;
1211: A->ops->mult = MatMult_SeqDenseCUDA;
1212: A->ops->multadd = MatMultAdd_SeqDenseCUDA;
1213: A->ops->multtranspose = MatMultTranspose_SeqDenseCUDA;
1214: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDenseCUDA;
1215: A->ops->matmultnumeric = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1216: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1217: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1218: A->ops->axpy = MatAXPY_SeqDenseCUDA;
1219: A->ops->choleskyfactor = MatCholeskyFactor_SeqDenseCUDA;
1220: A->ops->lufactor = MatLUFactor_SeqDenseCUDA;
1221: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDenseCUDA;
1222: A->ops->getcolumnvector = MatGetColumnVector_SeqDenseCUDA;
1223: A->ops->scale = MatScale_SeqDenseCUDA;
1224: } else {
1225: /* make sure we have an up-to-date copy on the CPU */
1226: MatSeqDenseCUDACopyFromGPU(A);
1227: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDense);
1228: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArray_SeqDense);
1229: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArray_SeqDense);
1230: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDense);
1231: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDense);
1232: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDense);
1233: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDense);
1234: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDense);
1235: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDense);
1236: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDense);
1237: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDense);
1238: PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDense);
1240: A->ops->duplicate = MatDuplicate_SeqDense;
1241: A->ops->mult = MatMult_SeqDense;
1242: A->ops->multadd = MatMultAdd_SeqDense;
1243: A->ops->multtranspose = MatMultTranspose_SeqDense;
1244: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDense;
1245: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1246: A->ops->matmultnumeric = MatMatMultNumeric_SeqDense_SeqDense;
1247: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1248: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1249: A->ops->axpy = MatAXPY_SeqDense;
1250: A->ops->choleskyfactor = MatCholeskyFactor_SeqDense;
1251: A->ops->lufactor = MatLUFactor_SeqDense;
1252: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1253: A->ops->getcolumnvector = MatGetColumnVector_SeqDense;
1254: A->ops->scale = MatScale_SeqDense;
1255: }
1256: if (a->cmat) {
1257: MatBindToCPU(a->cmat,flg);
1258: }
1259: return(0);
1260: }
1262: PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1263: {
1264: Mat B;
1265: PetscErrorCode ierr;
1268: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1269: /* TODO these cases should be optimized */
1270: MatConvert_Basic(M,type,reuse,newmat);
1271: return(0);
1272: }
1274: B = *newmat;
1275: MatBindToCPU_SeqDenseCUDA(B,PETSC_TRUE);
1276: MatReset_SeqDenseCUDA(B);
1277: PetscFree(B->defaultvectype);
1278: PetscStrallocpy(VECSTANDARD,&B->defaultvectype);
1279: PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSE);
1280: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",NULL);
1281: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C",NULL);
1282: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C",NULL);
1283: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C",NULL);
1284: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C",NULL);
1285: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C",NULL);
1286: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C",NULL);
1287: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C",NULL);
1288: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C",NULL);
1289: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C",NULL);
1290: PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",NULL);
1292: B->ops->bindtocpu = NULL;
1293: B->ops->destroy = MatDestroy_SeqDense;
1294: B->offloadmask = PETSC_OFFLOAD_CPU;
1295: return(0);
1296: }
1298: PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1299: {
1300: Mat_SeqDenseCUDA *dB;
1301: Mat B;
1302: PetscErrorCode ierr;
1305: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1306: /* TODO these cases should be optimized */
1307: MatConvert_Basic(M,type,reuse,newmat);
1308: return(0);
1309: }
1311: B = *newmat;
1312: PetscFree(B->defaultvectype);
1313: PetscStrallocpy(VECCUDA,&B->defaultvectype);
1314: PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSECUDA);
1315: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C", MatConvert_SeqDenseCUDA_SeqDense);
1316: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C", MatDenseCUDAGetArray_SeqDenseCUDA);
1317: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C", MatDenseCUDAGetArrayRead_SeqDenseCUDA);
1318: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C", MatDenseCUDAGetArrayWrite_SeqDenseCUDA);
1319: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C", MatDenseCUDARestoreArray_SeqDenseCUDA);
1320: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C", MatDenseCUDARestoreArrayRead_SeqDenseCUDA);
1321: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C", MatDenseCUDARestoreArrayWrite_SeqDenseCUDA);
1322: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C", MatDenseCUDAPlaceArray_SeqDenseCUDA);
1323: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C", MatDenseCUDAResetArray_SeqDenseCUDA);
1324: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C", MatDenseCUDAReplaceArray_SeqDenseCUDA);
1325: PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",MatProductSetFromOptions_SeqAIJ_SeqDense);
1327: PetscNewLog(B,&dB);
1328: B->spptr = dB;
1330: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1332: MatBindToCPU_SeqDenseCUDA(B,PETSC_FALSE);
1333: B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
1334: B->ops->destroy = MatDestroy_SeqDenseCUDA;
1335: return(0);
1336: }
1338: /*@C
1339: MatCreateSeqDenseCUDA - Creates a sequential matrix in dense format using CUDA.
1341: Collective
1343: Input Parameters:
1344: + comm - MPI communicator
1345: . m - number of rows
1346: . n - number of columns
1347: - data - optional location of GPU matrix data. Set data=NULL for PETSc
1348: to control matrix memory allocation.
1350: Output Parameter:
1351: . A - the matrix
1353: Notes:
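   If data is passed in, it must be a device (GPU) pointer holding the matrix entries in column-major order; it is used directly and not copied. Pass data=NULL to have PETSc allocate the device storage.

   A minimal usage sketch (error checking with CHKERRQ() omitted):
.vb
   Mat          A;
   PetscScalar *d_a;

   MatCreateSeqDenseCUDA(PETSC_COMM_SELF,m,n,NULL,&A);
   MatDenseCUDAGetArrayWrite(A,&d_a);  /* device pointer: fill with a kernel or cudaMemcpy() */
   MatDenseCUDARestoreArrayWrite(A,&d_a);
.ve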
1355: Level: intermediate
1357: .seealso: MatCreate(), MatCreateSeqDense()
1358: @*/
1359: PetscErrorCode MatCreateSeqDenseCUDA(MPI_Comm comm,PetscInt m,PetscInt n,PetscScalar *data,Mat *A)
1360: {
1362: PetscMPIInt size;
1365: MPI_Comm_size(comm,&size);
1366: if (size > 1) SETERRQ1(comm,PETSC_ERR_ARG_WRONG,"Invalid communicator size %d",size);
1367: MatCreate(comm,A);
1368: MatSetSizes(*A,m,n,m,n);
1369: MatSetType(*A,MATSEQDENSECUDA);
1370: MatSeqDenseCUDASetPreallocation(*A,data);
1371: return(0);
1372: }
1374: /*MC
1375: MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.
1377: Options Database Keys:
1378: . -mat_type seqdensecuda - sets the matrix type to "seqdensecuda" during a call to MatSetFromOptions()
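
   Notes:
   The type can also be selected programmatically (a minimal sketch; m and n are the matrix sizes, error checking omitted):
.vb
   MatCreate(PETSC_COMM_SELF,&A);
   MatSetSizes(A,m,n,m,n);
   MatSetType(A,MATSEQDENSECUDA);   /* or MatSetFromOptions(A) together with -mat_type seqdensecuda */
   MatSetUp(A);
.ve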
1380: Level: beginner
1381: M*/
1382: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
1383: {
1387: PetscCUDAInitializeCheck();
1388: MatCreate_SeqDense(B);
1389: MatConvert_SeqDense_SeqDenseCUDA(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
1390: return(0);
1391: }