Actual source code: densecuda.cu
1: /*
2: Defines the matrix operations for sequential dense matrices with CUDA
3: */
4: #include <petscpkg_version.h>
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6: #include <../src/mat/impls/dense/seq/dense.h>
7: #include <petsccublas.h>
9: /* cublas definitions are here */
10: #include <petsc/private/cudavecimpl.h>
12: #if defined(PETSC_USE_COMPLEX)
13: #if defined(PETSC_USE_REAL_SINGLE)
14: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnCpotrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
15: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnCpotrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
16: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnCpotrs((a),(b),(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g),(h),(i))
17: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnCpotri((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
18: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnCpotri_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
19: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnCsytrf((a),(b),(c),(cuComplex*)(d),(e),(f),(cuComplex*)(g),(h),(i))
20: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnCsytrf_bufferSize((a),(b),(cuComplex*)(c),(d),(e))
21: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnCgetrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
22: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnCgetrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
23: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnCgetrs((a),(b),(c),(d),(cuComplex*)(e),(f),(g),(cuComplex*)(h),(i),(j))
24: #else /* complex double */
25: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnZpotrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
26: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnZpotrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
27: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnZpotrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g),(h),(i))
28: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnZpotri((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
29: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnZpotri_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
30: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnZsytrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(f),(cuDoubleComplex*)(g),(h),(i))
31: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnZsytrf_bufferSize((a),(b),(cuDoubleComplex*)(c),(d),(e))
32: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnZgetrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
33: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnZgetrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
34: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnZgetrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(g),(cuDoubleComplex*)(h),(i),(j))
35: #endif
36: #else /* real single */
37: #if defined(PETSC_USE_REAL_SINGLE)
38: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnSpotrf((a),(b),(c),(d),(e),(f),(g),(h))
39: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnSpotrf_bufferSize((a),(b),(c),(d),(e),(f))
40: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnSpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
41: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnSpotri((a),(b),(c),(d),(e),(f),(g),(h))
42: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnSpotri_bufferSize((a),(b),(c),(d),(e),(f))
43: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnSsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
44: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnSsytrf_bufferSize((a),(b),(c),(d),(e))
45: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnSgetrf((a),(b),(c),(d),(e),(f),(g),(h))
46: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnSgetrf_bufferSize((a),(b),(c),(d),(e),(f))
47: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnSgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
48: #else /* real double */
49: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnDpotrf((a),(b),(c),(d),(e),(f),(g),(h))
50: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnDpotrf_bufferSize((a),(b),(c),(d),(e),(f))
51: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnDpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
52: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnDpotri((a),(b),(c),(d),(e),(f),(g),(h))
53: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnDpotri_bufferSize((a),(b),(c),(d),(e),(f))
54: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnDsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
55: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnDsytrf_bufferSize((a),(b),(c),(d),(e))
56: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnDgetrf((a),(b),(c),(d),(e),(f),(g),(h))
57: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnDgetrf_bufferSize((a),(b),(c),(d),(e),(f))
58: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnDgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
59: #endif
60: #endif
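/* Example of how these wrappers resolve: in a real, double-precision build (neither
   PETSC_USE_COMPLEX nor PETSC_USE_REAL_SINGLE defined), a call such as

     cusolverDnXgetrf(handle,m,n,da,lda,d_work,d_ipiv,d_info);

   expands to the plain cuSOLVER routine

     cusolverDnDgetrf(handle,m,n,da,lda,d_work,d_ipiv,d_info);

   while the complex variants additionally cast PetscScalar* arguments to cuComplex* or
   cuDoubleComplex*. */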
62: typedef struct {
63: PetscScalar *d_v; /* pointer to the matrix on the GPU */
64: PetscBool user_alloc;
65: PetscScalar *unplacedarray; /* if one called MatDenseCUDAPlaceArray(), this is where the original array is stashed */
66: PetscBool unplaced_user_alloc;
67: /* factorization support */
68: int *d_fact_ipiv; /* device pivots */
69: PetscScalar *d_fact_work; /* device workspace */
70: int fact_lwork;
71: int *d_fact_info; /* device info */
72: /* workspace */
73: Vec workvec;
74: } Mat_SeqDenseCUDA;
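/* Storage convention used throughout this file: d_v points to device memory holding the
   matrix in column-major order, with the leading dimension lda taken from the host-side
   Mat_SeqDense, so entry (i,j) sits at

     d_v[(size_t)i + (size_t)j*lda]    // device address of entry (i,j)

   This is why the copy routines below transfer lda*n contiguous scalars when lda equals the
   number of rows, and fall back to per-column copies when lda is larger. */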
76: PetscErrorCode MatSeqDenseCUDASetPreallocation(Mat A, PetscScalar *d_data)
77: {
78: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
79: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
80: PetscErrorCode ierr;
81: PetscBool iscuda;
82: cudaError_t cerr;
85: PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&iscuda);
86: if (!iscuda) return(0);
87: /* CPU preallocation may not have been performed yet */
88: PetscLayoutSetUp(A->rmap);
89: PetscLayoutSetUp(A->cmap);
90: if (cA->lda <= 0) cA->lda = A->rmap->n;
91: if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
92: if (!d_data) { /* petsc-allocated storage */
93: size_t sz;
94: PetscIntMultError(cA->lda,A->cmap->n,NULL);
95: sz = cA->lda*A->cmap->n*sizeof(PetscScalar);
96: cerr = cudaMalloc((void**)&dA->d_v,sz);CHKERRCUDA(cerr);
97: cerr = cudaMemset(dA->d_v,0,sz);CHKERRCUDA(cerr);
98: dA->user_alloc = PETSC_FALSE;
99: } else { /* user-allocated storage */
100: dA->d_v = d_data;
101: dA->user_alloc = PETSC_TRUE;
102: }
103: A->offloadmask = PETSC_OFFLOAD_GPU;
104: A->preallocated = PETSC_TRUE;
105: A->assembled = PETSC_TRUE;
106: return(0);
107: }
109: PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
110: {
111: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
112: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
113: PetscErrorCode ierr;
114: cudaError_t cerr;
118: PetscInfo3(A,"%s matrix %d x %d\n",A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
119: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
120: if (!cA->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
121: MatSeqDenseSetPreallocation(A,NULL);
122: }
123: PetscLogEventBegin(MAT_DenseCopyFromGPU,A,0,0,0);
124: if (cA->lda > A->rmap->n) {
125: PetscInt j,m = A->rmap->n;
127: for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
128: cerr = cudaMemcpy(cA->v + j*cA->lda,dA->d_v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
129: }
130: } else {
131: cerr = cudaMemcpy(cA->v,dA->d_v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
132: }
133: PetscLogGpuToCpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
134: PetscLogEventEnd(MAT_DenseCopyFromGPU,A,0,0,0);
136: A->offloadmask = PETSC_OFFLOAD_BOTH;
137: }
138: return(0);
139: }
141: PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
142: {
143: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
144: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
145: PetscBool copy;
146: PetscErrorCode ierr;
147: cudaError_t cerr;
151: if (A->boundtocpu) return(0);
152: copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
153: PetscInfo3(A,"%s matrix %d x %d\n",copy ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
154: if (copy) {
155: if (!dA->d_v) { /* Allocate GPU memory if not present */
156: MatSeqDenseCUDASetPreallocation(A,NULL);
157: }
158: PetscLogEventBegin(MAT_DenseCopyToGPU,A,0,0,0);
159: if (cA->lda > A->rmap->n) {
160: PetscInt j,m = A->rmap->n;
162: for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
163: cerr = cudaMemcpy(dA->d_v + j*cA->lda,cA->v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
164: }
165: } else {
166: cerr = cudaMemcpy(dA->d_v,cA->v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
167: }
168: PetscLogCpuToGpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
169: PetscLogEventEnd(MAT_DenseCopyToGPU,A,0,0,0);
171: A->offloadmask = PETSC_OFFLOAD_BOTH;
172: }
173: return(0);
174: }
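/* A possible way to address the "TODO: it can be done better" loops in the two copy
   routines above: when lda > m, the strided transfer could be expressed as a single pitched
   copy. This is an untested sketch, not what the code currently does:

     cerr = cudaMemcpy2D(dA->d_v,cA->lda*sizeof(PetscScalar),         // dst pointer and pitch in bytes
                         cA->v,cA->lda*sizeof(PetscScalar),           // src pointer and pitch in bytes
                         A->rmap->n*sizeof(PetscScalar),A->cmap->n,   // width in bytes, height = number of columns
                         cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
*/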
176: static PetscErrorCode MatCopy_SeqDenseCUDA(Mat A,Mat B,MatStructure str)
177: {
178: Mat_SeqDense *a = (Mat_SeqDense*)A->data,*b = (Mat_SeqDense*)B->data;
179: PetscErrorCode ierr;
180: const PetscScalar *va;
181: PetscScalar *vb;
182: PetscInt lda1=a->lda,lda2=b->lda, m=A->rmap->n,n=A->cmap->n, j;
183: cudaError_t cerr;
186: /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
187: if (A->ops->copy != B->ops->copy) {
188: MatCopy_Basic(A,B,str);
189: return(0);
190: }
191: if (m != B->rmap->n || n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"size(B) != size(A)");
192: MatDenseCUDAGetArrayRead(A,&va);
193: MatDenseCUDAGetArrayWrite(B,&vb);
194: PetscLogGpuTimeBegin();
195: if (lda1>m || lda2>m) {
196: for (j=0; j<n; j++) {
197: cerr = cudaMemcpy(vb+j*lda2,va+j*lda1,m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
198: }
199: } else {
200: cerr = cudaMemcpy(vb,va,m*(n*sizeof(PetscScalar)),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
201: }
202: cerr = WaitForCUDA();CHKERRCUDA(cerr);
203: PetscLogGpuTimeEnd();
204: MatDenseCUDARestoreArrayWrite(B,&vb);
205: MatDenseCUDARestoreArrayRead(A,&va);
206: return(0);
207: }
209: static PetscErrorCode MatDenseCUDAPlaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
210: {
211: Mat_SeqDense *aa = (Mat_SeqDense*)A->data;
212: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
213: PetscErrorCode ierr;
216: if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
217: if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
218: if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
219: if (aa->v) { MatSeqDenseCUDACopyToGPU(A); }
220: dA->unplacedarray = dA->d_v;
221: dA->unplaced_user_alloc = dA->user_alloc;
222: dA->d_v = (PetscScalar*)a;
223: dA->user_alloc = PETSC_TRUE;
224: return(0);
225: }
227: static PetscErrorCode MatDenseCUDAResetArray_SeqDenseCUDA(Mat A)
228: {
229: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
230: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
231: PetscErrorCode ierr;
234: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
235: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
236: if (a->v) { MatSeqDenseCUDACopyToGPU(A); }
237: dA->d_v = dA->unplacedarray;
238: dA->user_alloc = dA->unplaced_user_alloc;
239: dA->unplacedarray = NULL;
240: return(0);
241: }
243: static PetscErrorCode MatDenseCUDAReplaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
244: {
245: Mat_SeqDense *aa = (Mat_SeqDense*)A->data;
246: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
247: cudaError_t cerr;
250: if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
251: if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
252: if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
253: if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
254: dA->d_v = (PetscScalar*)a;
255: dA->user_alloc = PETSC_FALSE;
256: return(0);
257: }
259: static PetscErrorCode MatDenseCUDAGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
260: {
261: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
262: PetscErrorCode ierr;
265: if (!dA->d_v) {
266: MatSeqDenseCUDASetPreallocation(A,NULL);
267: }
268: *a = dA->d_v;
269: return(0);
270: }
272: static PetscErrorCode MatDenseCUDARestoreArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
273: {
275: *a = NULL;
276: return(0);
277: }
279: static PetscErrorCode MatDenseCUDAGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
280: {
281: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
282: PetscErrorCode ierr;
285: MatSeqDenseCUDACopyToGPU(A);
286: *a = dA->d_v;
287: return(0);
288: }
290: static PetscErrorCode MatDenseCUDARestoreArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
291: {
293: *a = NULL;
294: return(0);
295: }
297: static PetscErrorCode MatDenseCUDAGetArray_SeqDenseCUDA(Mat A, PetscScalar **a)
298: {
299: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
300: PetscErrorCode ierr;
303: MatSeqDenseCUDACopyToGPU(A);
304: *a = dA->d_v;
305: return(0);
306: }
308: static PetscErrorCode MatDenseCUDARestoreArray_SeqDenseCUDA(Mat A, PetscScalar **a)
309: {
311: *a = NULL;
312: return(0);
313: }
315: PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
316: {
317: #if PETSC_PKG_CUDA_VERSION_GE(10,1,0)
318: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
319: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
320: PetscScalar *da;
321: PetscErrorCode ierr;
322: cudaError_t ccer;
323: cusolverStatus_t cerr;
324: cusolverDnHandle_t handle;
325: int n,lda;
326: #if defined(PETSC_USE_DEBUG)
327: int info;
328: #endif
331: if (!A->rmap->n || !A->cmap->n) return(0);
332: PetscCUSOLVERDnGetHandle(&handle);
333: PetscMPIIntCast(A->cmap->n,&n);
334: PetscMPIIntCast(a->lda,&lda);
335: if (A->factortype == MAT_FACTOR_LU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDngetri not implemented");
336: else if (A->factortype == MAT_FACTOR_CHOLESKY) {
337: if (!dA->d_fact_ipiv) { /* spd */
338: int il;
340: MatDenseCUDAGetArray(A,&da);
341: cerr = cusolverDnXpotri_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&il);CHKERRCUSOLVER(cerr);
342: if (il > dA->fact_lwork) {
343: dA->fact_lwork = il;
345: ccer = cudaFree(dA->d_fact_work);CHKERRCUDA(ccer);
346: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
347: }
348: PetscLogGpuTimeBegin();
349: cerr = cusolverDnXpotri(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
350: ccer = WaitForCUDA();CHKERRCUDA(ccer);
351: PetscLogGpuTimeEnd();
352: MatDenseCUDARestoreArray(A,&da);
353: /* TODO (write cuda kernel) */
354: MatSeqDenseSymmetrize_Private(A,PETSC_TRUE);
355: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytri not implemented");
356: }
357: #if defined(PETSC_USE_DEBUG)
358: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
359: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: leading minor of order %d is zero",info);
360: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
361: #endif
362: PetscLogGpuFlops(1.0*n*n*n/3.0);
363: A->ops->solve = NULL;
364: A->ops->solvetranspose = NULL;
365: A->ops->matsolve = NULL;
366: A->factortype = MAT_FACTOR_NONE;
368: PetscFree(A->solvertype);
369: return(0);
370: #else
371: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Upgrade to CUDA version 10.1.0 or higher");
372: #endif
373: }
375: static PetscErrorCode MatMatSolve_SeqDenseCUDA(Mat A,Mat B,Mat X)
376: {
377: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
378: Mat_SeqDense *x = (Mat_SeqDense*)X->data;
379: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
380: const PetscScalar *da;
381: PetscScalar *dx;
382: cusolverDnHandle_t handle;
383: PetscBool iscuda;
384: int nrhs,n,lda,ldx;
385: #if defined(PETSC_USE_DEBUG)
386: int info;
387: #endif
388: cudaError_t ccer;
389: cusolverStatus_t cerr;
390: PetscErrorCode ierr;
393: if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
394: if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
396: if (X != B) {
397: MatCopy(B,X,SAME_NONZERO_PATTERN);
398: }
399: MatDenseCUDAGetArrayRead(A,&da);
400: /* MatMatSolve does not have a dispatching mechanism; we may end up with a MATSEQDENSE here */
401: PetscObjectTypeCompare((PetscObject)X,MATSEQDENSECUDA,&iscuda);
402: if (!iscuda) {
403: MatConvert(X,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&X);
404: }
405: MatDenseCUDAGetArray(X,&dx);
406: PetscMPIIntCast(A->rmap->n,&n);
407: PetscMPIIntCast(X->cmap->n,&nrhs);
408: PetscMPIIntCast(a->lda,&lda);
409: PetscMPIIntCast(x->lda,&ldx);
410: PetscCUSOLVERDnGetHandle(&handle);
411: PetscLogGpuTimeBegin();
412: if (A->factortype == MAT_FACTOR_LU) {
413: PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
414: cerr = cusolverDnXgetrs(handle,CUBLAS_OP_N,n,nrhs,da,lda,dA->d_fact_ipiv,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
415: } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
416: PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
417: if (!dA->d_fact_ipiv) { /* spd */
418: /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
419: cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,nrhs,da,lda,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
420: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
421: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
422: ccer = WaitForCUDA();CHKERRCUDA(ccer);
423: PetscLogGpuTimeEnd();
424: MatDenseCUDARestoreArrayRead(A,&da);
425: MatDenseCUDARestoreArray(X,&dx);
426: if (!iscuda) {
427: MatConvert(X,MATSEQDENSE,MAT_INPLACE_MATRIX,&X);
428: }
429: #if defined(PETSC_USE_DEBUG)
430: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
431: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
432: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
433: #endif
434: PetscLogGpuFlops(nrhs*(2.0*n*n - n));
435: return(0);
436: }
438: static PetscErrorCode MatSolve_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,PetscBool trans)
439: {
440: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
441: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
442: const PetscScalar *da;
443: PetscScalar *y;
444: cusolverDnHandle_t handle;
445: int one = 1,n,lda;
446: #if defined(PETSC_USE_DEBUG)
447: int info;
448: #endif
449: cudaError_t ccer;
450: cusolverStatus_t cerr;
451: PetscBool iscuda;
452: PetscErrorCode ierr;
455: if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
456: if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
457: PetscMPIIntCast(A->rmap->n,&n);
458: /* MatSolve does not have a dispatching mechanism; we may end up with a VECSTANDARD here */
459: PetscObjectTypeCompareAny((PetscObject)yy,&iscuda,VECSEQCUDA,VECMPICUDA,"");
460: if (iscuda) {
461: VecCopy(xx,yy);
462: VecCUDAGetArray(yy,&y);
463: } else {
464: if (!dA->workvec) {
465: MatCreateVecs(A,&dA->workvec,NULL);
466: }
467: VecCopy(xx,dA->workvec);
468: VecCUDAGetArray(dA->workvec,&y);
469: }
470: MatDenseCUDAGetArrayRead(A,&da);
471: PetscMPIIntCast(a->lda,&lda);
472: PetscCUSOLVERDnGetHandle(&handle);
473: PetscLogGpuTimeBegin();
474: if (A->factortype == MAT_FACTOR_LU) {
475: PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
476: cerr = cusolverDnXgetrs(handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,n,one,da,lda,dA->d_fact_ipiv,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
477: } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
478: PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
479: if (!dA->d_fact_ipiv) { /* spd */
480: /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
481: cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,one,da,lda,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
482: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
483: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
484: ccer = WaitForCUDA();CHKERRCUDA(ccer);
485: PetscLogGpuTimeEnd();
486: if (iscuda) {
487: VecCUDARestoreArray(yy,&y);
488: } else {
489: VecCUDARestoreArray(dA->workvec,&y);
490: VecCopy(dA->workvec,yy);
491: }
492: MatDenseCUDARestoreArrayRead(A,&da);
493: #if defined(PETSC_USE_DEBUG)
494: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
495: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
496: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
497: #endif
498: PetscLogGpuFlops(2.0*n*n - n);
499: return(0);
500: }
502: static PetscErrorCode MatSolve_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
503: {
504: PetscErrorCode ierr;
507: MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_FALSE);
508: return(0);
509: }
511: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
512: {
513: PetscErrorCode ierr;
516: MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_TRUE);
517: return(0);
518: }
520: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A,IS rperm,IS cperm,const MatFactorInfo *factinfo)
521: {
522: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
523: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
524: PetscScalar *da;
525: int m,n,lda;
526: #if defined(PETSC_USE_DEBUG)
527: int info;
528: #endif
529: cusolverStatus_t cerr;
530: cusolverDnHandle_t handle;
531: cudaError_t ccer;
532: PetscErrorCode ierr;
535: if (!A->rmap->n || !A->cmap->n) return(0);
536: PetscCUSOLVERDnGetHandle(&handle);
537: MatDenseCUDAGetArray(A,&da);
538: PetscMPIIntCast(A->cmap->n,&n);
539: PetscMPIIntCast(A->rmap->n,&m);
540: PetscMPIIntCast(a->lda,&lda);
541: PetscInfo2(A,"LU factor %d x %d on backend\n",m,n);
542: if (!dA->d_fact_ipiv) {
543: ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
544: }
545: if (!dA->fact_lwork) {
546: cerr = cusolverDnXgetrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
547: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
548: }
549: if (!dA->d_fact_info) {
550: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
551: }
552: PetscLogGpuTimeBegin();
553: cerr = cusolverDnXgetrf(handle,m,n,da,lda,dA->d_fact_work,dA->d_fact_ipiv,dA->d_fact_info);CHKERRCUSOLVER(cerr);
554: ccer = WaitForCUDA();CHKERRCUDA(ccer);
555: PetscLogGpuTimeEnd();
556: MatDenseCUDARestoreArray(A,&da);
557: #if defined(PETSC_USE_DEBUG)
558: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
559: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
560: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
561: #endif
562: A->factortype = MAT_FACTOR_LU;
563: PetscLogGpuFlops(2.0*n*n*m/3.0);
565: A->ops->solve = MatSolve_SeqDenseCUDA;
566: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
567: A->ops->matsolve = MatMatSolve_SeqDenseCUDA;
569: PetscFree(A->solvertype);
570: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
571: return(0);
572: }
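/* Hedged sketch of how the LU path above is typically exercised from user code (standard
   PETSc calls, error checking omitted; passing NULL row/column permutations is assumed to
   be acceptable for the dense case):

     MatFactorInfo info;
     MatFactorInfoInitialize(&info);
     MatLUFactor(A,NULL,NULL,&info);   // -> MatLUFactor_SeqDenseCUDA (cusolverDnXgetrf)
     MatSolve(A,b,x);                  // -> MatSolve_SeqDenseCUDA    (cusolverDnXgetrs)
*/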
574: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A,IS perm,const MatFactorInfo *factinfo)
575: {
576: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
577: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
578: PetscScalar *da;
579: int n,lda;
580: #if defined(PETSC_USE_DEBUG)
581: int info;
582: #endif
583: cusolverStatus_t cerr;
584: cusolverDnHandle_t handle;
585: cudaError_t ccer;
586: PetscErrorCode ierr;
589: if (!A->rmap->n || !A->cmap->n) return(0);
590: PetscCUSOLVERDnGetHandle(&handle);
591: PetscMPIIntCast(A->rmap->n,&n);
592: PetscInfo2(A,"Cholesky factor %d x %d on backend\n",n,n);
593: if (A->spd) {
594: MatDenseCUDAGetArray(A,&da);
595: PetscMPIIntCast(a->lda,&lda);
596: if (!dA->fact_lwork) {
597: cerr = cusolverDnXpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
598: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
599: }
600: if (!dA->d_fact_info) {
601: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
602: }
603: PetscLogGpuTimeBegin();
604: cerr = cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
605: ccer = WaitForCUDA();CHKERRCUDA(ccer);
606: PetscLogGpuTimeEnd();
608: MatDenseCUDARestoreArray(A,&da);
609: #if defined(PETSC_USE_DEBUG)
610: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
611: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
612: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
613: #endif
614: A->factortype = MAT_FACTOR_CHOLESKY;
615: PetscLogGpuFlops(1.0*n*n*n/3.0);
616: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
617: #if 0
618: /* at the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs and *hetr* routines
619: The code below should work and can be activated once the *sytrs routines become available */
620: if (!dA->d_fact_ipiv) {
621: ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
622: }
623: if (!dA->fact_lwork) {
624: cerr = cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
625: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
626: }
627: if (!dA->d_fact_info) {
628: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
629: }
630: PetscLogGpuTimeBegin();
631: cerr = cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
632: PetscLogGpuTimeEnd();
633: #endif
635: A->ops->solve = MatSolve_SeqDenseCUDA;
636: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
637: A->ops->matsolve = MatMatSolve_SeqDenseCUDA;
639: PetscFree(A->solvertype);
640: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
641: return(0);
642: }
644: /* GEMM: C = op(A)*op(B); tA and tB flag whether A and B are transposed */
645: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A,Mat B,Mat C,PetscBool tA,PetscBool tB)
646: {
647: const PetscScalar *da,*db;
648: PetscScalar *dc;
649: PetscScalar one=1.0,zero=0.0;
650: int m,n,k;
651: PetscInt alda,blda,clda;
652: PetscErrorCode ierr;
653: cublasHandle_t cublasv2handle;
654: PetscBool Aiscuda,Biscuda;
655: cublasStatus_t berr;
656: cudaError_t cerr;
659: /* we may end up with SEQDENSE as one of the arguments */
660: PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&Aiscuda);
661: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&Biscuda);
662: if (!Aiscuda) {
663: MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
664: }
665: if (!Biscuda) {
666: MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
667: }
668: PetscMPIIntCast(C->rmap->n,&m);
669: PetscMPIIntCast(C->cmap->n,&n);
670: if (tA) {
671: PetscMPIIntCast(A->rmap->n,&k);
672: } else {
673: PetscMPIIntCast(A->cmap->n,&k);
674: }
675: if (!m || !n || !k) return(0);
676: PetscInfo3(C,"Matrix-Matrix product %d x %d x %d on backend\n",m,k,n);
677: MatDenseCUDAGetArrayRead(A,&da);
678: MatDenseCUDAGetArrayRead(B,&db);
679: MatDenseCUDAGetArrayWrite(C,&dc);
680: MatDenseGetLDA(A,&alda);
681: MatDenseGetLDA(B,&blda);
682: MatDenseGetLDA(C,&clda);
683: PetscCUBLASGetHandle(&cublasv2handle);
684: PetscLogGpuTimeBegin();
685: berr = cublasXgemm(cublasv2handle,tA ? CUBLAS_OP_T : CUBLAS_OP_N,tB ? CUBLAS_OP_T : CUBLAS_OP_N,
686: m,n,k,&one,da,alda,db,blda,&zero,dc,clda);CHKERRCUBLAS(berr);
687: cerr = WaitForCUDA();CHKERRCUDA(cerr);
688: PetscLogGpuTimeEnd();
689: PetscLogGpuFlops(1.0*m*n*k + 1.0*m*n*(k-1));
690: MatDenseCUDARestoreArrayRead(A,&da);
691: MatDenseCUDARestoreArrayRead(B,&db);
692: MatDenseCUDARestoreArrayWrite(C,&dc);
693: if (!Aiscuda) {
694: MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
695: }
696: if (!Biscuda) {
697: MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
698: }
699: return(0);
700: }
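/* In the call above, C is m x n, op(A) is m x k and op(B) is k x n, with alpha = 1 and
   beta = 0, so C is overwritten. A hedged sketch of the user-level entry point that ends up
   in this routine for two MATSEQDENSECUDA matrices:

     Mat C;
     MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);   // C = A*B via cublasXgemm
     MatDestroy(&C);
*/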
702: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
703: {
707: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_TRUE,PETSC_FALSE);
708: return(0);
709: }
711: PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
712: {
716: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_FALSE);
717: return(0);
718: }
720: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
721: {
725: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_TRUE);
726: return(0);
727: }
729: PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
730: {
734: MatProductSetFromOptions_SeqDense(C);
735: return(0);
736: }
738: /* zz = op(A)*xx + yy
739:    if yy == NULL, this reduces to zz = op(A)*xx (plain MatMult) */
740: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans)
741: {
742: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
743: const PetscScalar *xarray,*da;
744: PetscScalar *zarray;
745: PetscScalar one=1.0,zero=0.0;
746: int m, n, lda; /* plain ints for cuBLAS; PetscMPIIntCast is used below since PetscMPIInt is typedef'ed to int */
747: cublasHandle_t cublasv2handle;
748: cublasStatus_t berr;
749: PetscErrorCode ierr;
752: if (yy && yy != zz) { /* mult add */
753: VecCopy_SeqCUDA(yy,zz);
754: }
755: if (!A->rmap->n || !A->cmap->n) {
756: if (!yy) { /* mult only */
757: VecSet_SeqCUDA(zz,0.0);
758: }
759: return(0);
760: }
761: PetscInfo2(A,"Matrix-vector product %d x %d on backend\n",A->rmap->n,A->cmap->n);
762: PetscMPIIntCast(A->rmap->n,&m);
763: PetscMPIIntCast(A->cmap->n,&n);
764: PetscMPIIntCast(mat->lda,&lda);
765: PetscCUBLASGetHandle(&cublasv2handle);
766: MatDenseCUDAGetArrayRead(A,&da);
767: VecCUDAGetArrayRead(xx,&xarray);
768: VecCUDAGetArray(zz,&zarray);
769: PetscLogGpuTimeBegin();
770: berr = cublasXgemv(cublasv2handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,
771: m,n,&one,da,lda,xarray,1,(yy ? &one : &zero),zarray,1);CHKERRCUBLAS(berr);
772: PetscLogGpuTimeEnd();
773: PetscLogGpuFlops(2.0*A->rmap->n*A->cmap->n - (yy ? 0 : A->rmap->n));
774: VecCUDARestoreArrayRead(xx,&xarray);
775: VecCUDARestoreArray(zz,&zarray);
776: MatDenseCUDARestoreArrayRead(A,&da);
777: return(0);
778: }
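/* The cublasXgemv call above computes zarray = op(A)*xarray + beta*zarray with beta = 1
   when yy is provided (the MultAdd variants) and beta = 0 otherwise. A hedged sketch of
   the user-level calls that reach this routine:

     MatMult(A,x,y);                 // y = A*x         (trans = PETSC_FALSE, yy = NULL)
     MatMultTransposeAdd(A,x,y,z);   // z = A^T*x + y   (trans = PETSC_TRUE,  yy = y)
*/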
780: PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
781: {
785: MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_FALSE);
786: return(0);
787: }
789: PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
790: {
794: MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_TRUE);
795: return(0);
796: }
798: PetscErrorCode MatMult_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
799: {
803: MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_FALSE);
804: return(0);
805: }
807: PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
808: {
812: MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_TRUE);
813: return(0);
814: }
816: static PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A,const PetscScalar **array)
817: {
818: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
822: MatSeqDenseCUDACopyFromGPU(A);
823: *array = mat->v;
824: return(0);
825: }
827: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseCUDA(Mat A,PetscScalar **array)
828: {
829: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
833: if (!mat->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
834: MatSeqDenseSetPreallocation(A,NULL);
835: }
836: *array = mat->v;
837: A->offloadmask = PETSC_OFFLOAD_CPU;
838: return(0);
839: }
841: static PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A,PetscScalar **array)
842: {
843: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
847: MatSeqDenseCUDACopyFromGPU(A);
848: *array = mat->v;
849: A->offloadmask = PETSC_OFFLOAD_CPU;
850: return(0);
851: }
853: PetscErrorCode MatScale_SeqDenseCUDA(Mat Y,PetscScalar alpha)
854: {
855: Mat_SeqDense *y = (Mat_SeqDense*)Y->data;
856: PetscScalar *dy;
857: int j,N,m,lday,one = 1;
858: cublasHandle_t cublasv2handle;
859: cublasStatus_t berr;
861: cudaError_t cerr;
864: PetscCUBLASGetHandle(&cublasv2handle);
865: MatDenseCUDAGetArray(Y,&dy);
866: PetscMPIIntCast(Y->rmap->n*Y->cmap->n,&N);
867: PetscMPIIntCast(Y->rmap->n,&m);
868: PetscMPIIntCast(y->lda,&lday);
869: PetscInfo2(Y,"Performing Scale %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
870: PetscLogGpuTimeBegin();
871: if (lday>m) {
872: for (j=0; j<Y->cmap->n; j++) {
873: berr = cublasXscal(cublasv2handle,m,&alpha,dy+lday*j,one);CHKERRCUBLAS(berr);
874: }
875: } else {
876: berr = cublasXscal(cublasv2handle,N,&alpha,dy,one);CHKERRCUBLAS(berr);
877: }
878: cerr = WaitForCUDA();CHKERRCUDA(cerr);
879: PetscLogGpuTimeEnd();
880: PetscLogGpuFlops(N);
881: MatDenseCUDARestoreArray(Y,&dy);
882: return(0);
883: }
885: PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y,PetscScalar alpha,Mat X,MatStructure str)
886: {
887: Mat_SeqDense *x = (Mat_SeqDense*)X->data;
888: Mat_SeqDense *y = (Mat_SeqDense*)Y->data;
889: const PetscScalar *dx;
890: PetscScalar *dy;
891: int j,N,m,ldax,lday,one = 1;
892: cublasHandle_t cublasv2handle;
893: cublasStatus_t berr;
894: PetscErrorCode ierr;
895: cudaError_t cerr;
898: if (!X->rmap->n || !X->cmap->n) return(0);
899: PetscCUBLASGetHandle(&cublasv2handle);
900: MatDenseCUDAGetArrayRead(X,&dx);
901: if (alpha != 0.0) {
902: MatDenseCUDAGetArray(Y,&dy);
903: } else {
904: MatDenseCUDAGetArrayWrite(Y,&dy);
905: }
906: PetscMPIIntCast(X->rmap->n*X->cmap->n,&N);
907: PetscMPIIntCast(X->rmap->n,&m);
908: PetscMPIIntCast(x->lda,&ldax);
909: PetscMPIIntCast(y->lda,&lday);
910: PetscInfo2(Y,"Performing AXPY %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
911: PetscLogGpuTimeBegin();
912: if (ldax>m || lday>m) {
913: for (j=0; j<X->cmap->n; j++) {
914: berr = cublasXaxpy(cublasv2handle,m,&alpha,dx+j*ldax,one,dy+j*lday,one);CHKERRCUBLAS(berr);
915: }
916: } else {
917: berr = cublasXaxpy(cublasv2handle,N,&alpha,dx,one,dy,one);CHKERRCUBLAS(berr);
918: }
919: cerr = WaitForCUDA();CHKERRCUDA(cerr);
920: PetscLogGpuTimeEnd();
921: PetscLogGpuFlops(PetscMax(2.*N-1,0));
922: MatDenseCUDARestoreArrayRead(X,&dx);
923: if (alpha != 0.0) {
924: MatDenseCUDARestoreArray(Y,&dy);
925: } else {
926: MatDenseCUDARestoreArrayWrite(Y,&dy);
927: }
928: return(0);
929: }
931: static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
932: {
933: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
934: cudaError_t cerr;
935: PetscErrorCode ierr;
938: if (dA) {
939: if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
940: if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
941: cerr = cudaFree(dA->d_fact_ipiv);CHKERRCUDA(cerr);
942: cerr = cudaFree(dA->d_fact_info);CHKERRCUDA(cerr);
943: cerr = cudaFree(dA->d_fact_work);CHKERRCUDA(cerr);
944: VecDestroy(&dA->workvec);
945: }
946: PetscFree(A->spptr);
947: return(0);
948: }
950: PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
951: {
952: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
956: /* prevent copying the data back from the GPU if we own the data pointer */
957: if (!a->user_alloc) { A->offloadmask = PETSC_OFFLOAD_CPU; }
958: MatConvert_SeqDenseCUDA_SeqDense(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
959: MatDestroy_SeqDense(A);
960: return(0);
961: }
963: PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A,MatDuplicateOption cpvalues,Mat *B)
964: {
968: MatCreate(PetscObjectComm((PetscObject)A),B);
969: MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
970: MatSetType(*B,((PetscObject)A)->type_name);
971: MatDuplicateNoCreate_SeqDense(*B,A,cpvalues);
972: if (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) {
973: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
974: const PetscScalar *da;
975: PetscScalar *db;
976: cudaError_t cerr;
977: PetscInt ldb;
979: MatDenseCUDAGetArrayRead(A,&da);
980: MatDenseCUDAGetArrayWrite(*B,&db);
981: MatDenseGetLDA(*B,&ldb);
982: if (a->lda > A->rmap->n || ldb > A->rmap->n) {
983: PetscInt j,m = A->rmap->n;
985: for (j=0; j<A->cmap->n; j++) { /* it can be done better */
986: cerr = cudaMemcpy(db+j*ldb,da+j*a->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
987: }
988: } else {
989: cerr = cudaMemcpy(db,da,(sizeof(PetscScalar)*A->cmap->n)*A->rmap->n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
990: }
991: MatDenseCUDARestoreArrayRead(A,&da);
992: MatDenseCUDARestoreArrayWrite(*B,&db);
993: (*B)->offloadmask = PETSC_OFFLOAD_BOTH;
994: }
995: return(0);
996: }
998: #include <petsc/private/vecimpl.h>
1000: static PetscErrorCode MatGetColumnVector_SeqDenseCUDA(Mat A,Vec v,PetscInt col)
1001: {
1002: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1003: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1004: PetscErrorCode ierr;
1005: PetscScalar *x;
1006: PetscBool viscuda;
1007: cudaError_t cerr;
1010: PetscObjectTypeCompareAny((PetscObject)v,&viscuda,VECSEQCUDA,VECMPICUDA,VECCUDA,"");
1011: if (viscuda && !v->boundtocpu) { /* update device data */
1012: VecCUDAGetArrayWrite(v,&x);
1013: if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1014: cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1015: } else {
1016: cerr = cudaMemcpy(x,a->v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1017: }
1018: VecCUDARestoreArrayWrite(v,&x);
1019: } else { /* update host data */
1020: VecGetArrayWrite(v,&x);
1021: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask & PETSC_OFFLOAD_CPU) {
1022: PetscArraycpy(x,a->v+col*a->lda,A->rmap->n);
1023: } else if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1024: cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1025: }
1026: VecRestoreArrayWrite(v,&x);
1027: }
1028: return(0);
1029: }
1031: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A,MatFactorType ftype,Mat *fact)
1032: {
1036: MatCreate(PetscObjectComm((PetscObject)A),fact);
1037: MatSetSizes(*fact,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
1038: MatSetType(*fact,MATSEQDENSECUDA);
1039: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
1040: (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1041: } else {
1042: (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1043: }
1044: (*fact)->factortype = ftype;
1046: PetscFree((*fact)->solvertype);
1047: PetscStrallocpy(MATSOLVERCUDA,&(*fact)->solvertype);
1048: return(0);
1049: }
1051: static PetscErrorCode MatDenseGetColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1052: {
1053: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1057: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1058: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1059: MatDenseCUDAGetArray(A,(PetscScalar**)&a->ptrinuse);
1060: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1061: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1062: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1063: }
1064: a->vecinuse = col + 1;
1065: VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1066: *v = a->cvec;
1067: return(0);
1068: }
1070: static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1071: {
1072: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1076: if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1077: if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1078: a->vecinuse = 0;
1079: VecCUDAResetArray(a->cvec);
1080: MatDenseCUDARestoreArray(A,(PetscScalar**)&a->ptrinuse);
1081: *v = NULL;
1082: return(0);
1083: }
1085: static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1086: {
1087: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1091: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1092: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1093: MatDenseCUDAGetArrayRead(A,&a->ptrinuse);
1094: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1095: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1096: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1097: }
1098: a->vecinuse = col + 1;
1099: VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1100: VecLockReadPush(a->cvec);
1101: *v = a->cvec;
1102: return(0);
1103: }
1105: static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1106: {
1107: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1111: if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1112: if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1113: a->vecinuse = 0;
1114: VecLockReadPop(a->cvec);
1115: VecCUDAResetArray(a->cvec);
1116: MatDenseCUDARestoreArrayRead(A,&a->ptrinuse);
1117: *v = NULL;
1118: return(0);
1119: }
1121: static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1122: {
1123: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1127: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1128: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1129: MatDenseCUDAGetArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1130: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1131: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1132: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1133: }
1134: a->vecinuse = col + 1;
1135: VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1136: *v = a->cvec;
1137: return(0);
1138: }
1140: static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1141: {
1142: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1146: if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1147: if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1148: a->vecinuse = 0;
1149: VecCUDAResetArray(a->cvec);
1150: MatDenseCUDARestoreArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1151: *v = NULL;
1152: return(0);
1153: }
1155: static PetscErrorCode MatDenseGetSubMatrix_SeqDenseCUDA(Mat A,PetscInt cbegin,PetscInt cend,Mat *v)
1156: {
1157: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1158: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1159: PetscErrorCode ierr;
1162: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1163: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1164: if (a->cmat && cend-cbegin != a->cmat->cmap->N) {
1165: MatDestroy(&a->cmat);
1166: }
1167: MatSeqDenseCUDACopyToGPU(A);
1168: if (!a->cmat) {
1169: MatCreateDenseCUDA(PetscObjectComm((PetscObject)A),A->rmap->n,PETSC_DECIDE,A->rmap->N,cend-cbegin,dA->d_v + (size_t)cbegin * (size_t)a->lda,&a->cmat);
1170: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cmat);
1171: } else {
1172: MatDenseCUDAPlaceArray(a->cmat,dA->d_v + (size_t)cbegin * (size_t)a->lda);
1173: }
1174: MatDenseSetLDA(a->cmat,a->lda);
1175: if (a->v) { MatDensePlaceArray(a->cmat,a->v + (size_t)cbegin * (size_t)a->lda); }
1176: a->cmat->offloadmask = A->offloadmask;
1177: a->matinuse = cbegin + 1;
1178: *v = a->cmat;
1179: return(0);
1180: }
1182: static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseCUDA(Mat A,Mat *v)
1183: {
1184: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1188: if (!a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetSubMatrix() first");
1189: if (!a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column matrix");
1190: if (*v != a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Not the matrix obtained from MatDenseGetSubMatrix()");
1191: a->matinuse = 0;
1192: A->offloadmask = PETSC_OFFLOAD_GPU;
1193: MatDenseCUDAResetArray(a->cmat);
1194: MatDenseResetArray(a->cmat);
1195: *v = NULL;
1196: return(0);
1197: }
1199: static PetscErrorCode MatDenseSetLDA_SeqDenseCUDA(Mat A,PetscInt lda)
1200: {
1201: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
1202: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1203: PetscBool data;
1206: data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
1207: if (!dA->user_alloc && data && cA->lda!=lda) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"LDA cannot be changed after allocation of internal storage");
1208: if (lda < A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"LDA %D must be at least matrix dimension %D",lda,A->rmap->n);
1209: cA->lda = lda;
1210: return(0);
1211: }
1213: static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A,PetscBool flg)
1214: {
1215: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1219: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1220: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1221: A->boundtocpu = flg;
1222: if (!flg) {
1223: PetscBool iscuda;
1225: PetscObjectTypeCompare((PetscObject)a->cvec,VECSEQCUDA,&iscuda);
1226: if (!iscuda) {
1227: VecDestroy(&a->cvec);
1228: }
1229: PetscObjectTypeCompare((PetscObject)a->cmat,MATSEQDENSECUDA,&iscuda);
1230: if (!iscuda) {
1231: MatDestroy(&a->cmat);
1232: }
1233: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDenseCUDA);
1234: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArrayRead_SeqDenseCUDA);
1235: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArrayWrite_SeqDenseCUDA);
1236: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDenseCUDA);
1237: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDenseCUDA);
1238: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDenseCUDA);
1239: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDenseCUDA);
1240: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDenseCUDA);
1241: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDenseCUDA);
1242: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDenseCUDA);
1243: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDenseCUDA);
1244: PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDenseCUDA);
1246: A->ops->duplicate = MatDuplicate_SeqDenseCUDA;
1247: A->ops->mult = MatMult_SeqDenseCUDA;
1248: A->ops->multadd = MatMultAdd_SeqDenseCUDA;
1249: A->ops->multtranspose = MatMultTranspose_SeqDenseCUDA;
1250: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDenseCUDA;
1251: A->ops->matmultnumeric = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1252: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1253: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1254: A->ops->axpy = MatAXPY_SeqDenseCUDA;
1255: A->ops->choleskyfactor = MatCholeskyFactor_SeqDenseCUDA;
1256: A->ops->lufactor = MatLUFactor_SeqDenseCUDA;
1257: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDenseCUDA;
1258: A->ops->getcolumnvector = MatGetColumnVector_SeqDenseCUDA;
1259: A->ops->scale = MatScale_SeqDenseCUDA;
1260: A->ops->copy = MatCopy_SeqDenseCUDA;
1261: } else {
1262: /* make sure we have an up-to-date copy on the CPU */
1263: MatSeqDenseCUDACopyFromGPU(A);
1264: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDense);
1265: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArray_SeqDense);
1266: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArray_SeqDense);
1267: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDense);
1268: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDense);
1269: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDense);
1270: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDense);
1271: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDense);
1272: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDense);
1273: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDense);
1274: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDense);
1275: PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDense);
1277: A->ops->duplicate = MatDuplicate_SeqDense;
1278: A->ops->mult = MatMult_SeqDense;
1279: A->ops->multadd = MatMultAdd_SeqDense;
1280: A->ops->multtranspose = MatMultTranspose_SeqDense;
1281: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDense;
1282: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1283: A->ops->matmultnumeric = MatMatMultNumeric_SeqDense_SeqDense;
1284: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1285: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1286: A->ops->axpy = MatAXPY_SeqDense;
1287: A->ops->choleskyfactor = MatCholeskyFactor_SeqDense;
1288: A->ops->lufactor = MatLUFactor_SeqDense;
1289: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1290: A->ops->getcolumnvector = MatGetColumnVector_SeqDense;
1291: A->ops->scale = MatScale_SeqDense;
1292: A->ops->copy = MatCopy_SeqDense;
1293: }
1294: if (a->cmat) {
1295: MatBindToCPU(a->cmat,flg);
1296: }
1297: return(0);
1298: }
1300: PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1301: {
1302: Mat B;
1303: PetscErrorCode ierr;
1306: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1307: /* TODO these cases should be optimized */
1308: MatConvert_Basic(M,type,reuse,newmat);
1309: return(0);
1310: }
1312: B = *newmat;
1313: MatBindToCPU_SeqDenseCUDA(B,PETSC_TRUE);
1314: MatReset_SeqDenseCUDA(B);
1315: PetscFree(B->defaultvectype);
1316: PetscStrallocpy(VECSTANDARD,&B->defaultvectype);
1317: PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSE);
1318: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",NULL);
1319: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C",NULL);
1320: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C",NULL);
1321: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C",NULL);
1322: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C",NULL);
1323: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C",NULL);
1324: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C",NULL);
1325: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C",NULL);
1326: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C",NULL);
1327: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C",NULL);
1328: PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",NULL);
1330: B->ops->bindtocpu = NULL;
1331: B->ops->destroy = MatDestroy_SeqDense;
1332: B->offloadmask = PETSC_OFFLOAD_CPU;
1333: return(0);
1334: }
1336: PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1337: {
1338: Mat_SeqDenseCUDA *dB;
1339: Mat B;
1340: PetscErrorCode ierr;
1343: PetscCUDAInitializeCheck();
1344: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1345: /* TODO these cases should be optimized */
1346: MatConvert_Basic(M,type,reuse,newmat);
1347: return(0);
1348: }
1350: B = *newmat;
1351: PetscFree(B->defaultvectype);
1352: PetscStrallocpy(VECCUDA,&B->defaultvectype);
1353: PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSECUDA);
1354: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C", MatConvert_SeqDenseCUDA_SeqDense);
1355: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C", MatDenseCUDAGetArray_SeqDenseCUDA);
1356: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C", MatDenseCUDAGetArrayRead_SeqDenseCUDA);
1357: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C", MatDenseCUDAGetArrayWrite_SeqDenseCUDA);
1358: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C", MatDenseCUDARestoreArray_SeqDenseCUDA);
1359: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C", MatDenseCUDARestoreArrayRead_SeqDenseCUDA);
1360: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C", MatDenseCUDARestoreArrayWrite_SeqDenseCUDA);
1361: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C", MatDenseCUDAPlaceArray_SeqDenseCUDA);
1362: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C", MatDenseCUDAResetArray_SeqDenseCUDA);
1363: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C", MatDenseCUDAReplaceArray_SeqDenseCUDA);
1364: PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",MatProductSetFromOptions_SeqAIJ_SeqDense);
1366: PetscNewLog(B,&dB);
1367: B->spptr = dB;
1369: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1371: MatBindToCPU_SeqDenseCUDA(B,PETSC_FALSE);
1372: B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
1373: B->ops->destroy = MatDestroy_SeqDenseCUDA;
1374: return(0);
1375: }
1377: /*@C
1378: MatCreateSeqDenseCUDA - Creates a sequential matrix in dense format using CUDA.
1380: Collective
1382: Input Parameters:
1383: + comm - MPI communicator
1384: . m - number of rows
1385: . n - number of columns
1386: - data - optional location of GPU matrix data. Set data=NULL for PETSc
1387: to control matrix memory allocation.
1389: Output Parameter:
1390: . A - the matrix
1392: Notes:
   If data is provided, it must be a device (GPU) pointer to a column-major array holding the matrix values;
   PETSc will not free user-provided device memory. Pass data=NULL to have PETSc allocate and manage the GPU storage.
1394: Level: intermediate
1396: .seealso: MatCreate(), MatCreateSeqDense()
1397: @*/
1398: PetscErrorCode MatCreateSeqDenseCUDA(MPI_Comm comm,PetscInt m,PetscInt n,PetscScalar *data,Mat *A)
1399: {
1401: PetscMPIInt size;
1404: MPI_Comm_size(comm,&size);
1405: if (size > 1) SETERRQ1(comm,PETSC_ERR_ARG_WRONG,"Invalid communicator size %d",size);
1406: MatCreate(comm,A);
1407: MatSetSizes(*A,m,n,m,n);
1408: MatSetType(*A,MATSEQDENSECUDA);
1409: MatSeqDenseCUDASetPreallocation(*A,data);
1410: return(0);
1411: }
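/* Minimal usage sketch for MatCreateSeqDenseCUDA() (assumes a CUDA-enabled PETSc build and
   a single MPI rank; error checking omitted; fill_device_array() stands for a hypothetical
   user CUDA kernel and is not part of PETSc):

     Mat          A;
     PetscScalar *d_a;
     PetscInt     lda;
     MatCreateSeqDenseCUDA(PETSC_COMM_SELF,m,n,NULL,&A);   // NULL: PETSc allocates the GPU storage
     MatDenseCUDAGetArrayWrite(A,&d_a);                    // raw device pointer, column-major
     MatDenseGetLDA(A,&lda);
     fill_device_array(d_a,m,n,lda);                       // fill the matrix on the device
     MatDenseCUDARestoreArrayWrite(A,&d_a);
     MatMult(A,x,y);                                       // x,y are VECSEQCUDA vectors, e.g. from MatCreateVecs()
     MatDestroy(&A);
*/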
1413: /*MC
1414: MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.
1416: Options Database Keys:
1417: . -mat_type seqdensecuda - sets the matrix type to "seqdensecuda" during a call to MatSetFromOptions()
1419: Level: beginner
1420: M*/
1421: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
1422: {
1426: PetscCUDAInitializeCheck();
1427: MatCreate_SeqDense(B);
1428: MatConvert_SeqDense_SeqDenseCUDA(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
1429: return(0);
1430: }