Actual source code: densecuda.cu
1: /*
2: Defines the matrix operations for sequential dense matrices using CUDA
3: */
4: #include <petscpkg_version.h>
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6: #include <../src/mat/impls/dense/seq/dense.h>
7: #include <petsc/private/cudavecimpl.h>
9: #if defined(PETSC_USE_COMPLEX)
10: #if defined(PETSC_USE_REAL_SINGLE)
11: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnCpotrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
12: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnCpotrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
13: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnCpotrs((a),(b),(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g),(h),(i))
14: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnCpotri((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
15: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnCpotri_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
16: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnCsytrf((a),(b),(c),(cuComplex*)(d),(e),(f),(cuComplex*)(g),(h),(i))
17: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnCsytrf_bufferSize((a),(b),(cuComplex*)(c),(d),(e))
18: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnCgetrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
19: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnCgetrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
20: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnCgetrs((a),(b),(c),(d),(cuComplex*)(e),(f),(g),(cuComplex*)(h),(i),(j))
21: #define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnCgeqrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
22: #define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i) cusolverDnCgeqrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(cuComplex*)(g),(h),(i))
23: #define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnCunmqr_bufferSize((a),(b),(c),(d),(e),(f),(cuComplex*)(g),(h),(cuComplex*)(i),(cuComplex*)(j),(k),(l))
24: #define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnCunmqr((a),(b),(c),(d),(e),(f),(cuComplex*)(g),(h),(cuComplex*)(i),(cuComplex*)(j),(k),(cuComplex*)(l),(m),(n))
25: #define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l) cublasCtrsm((a),(b),(c),(d),(e),(f),(g),(cuComplex*)(h),(cuComplex*)(i),(j),(cuComplex*)(k),(l))
26: #else /* complex double */
27: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnZpotrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
28: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnZpotrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
29: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnZpotrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g),(h),(i))
30: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnZpotri((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
31: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnZpotri_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
32: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnZsytrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(f),(cuDoubleComplex*)(g),(h),(i))
33: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnZsytrf_bufferSize((a),(b),(cuDoubleComplex*)(c),(d),(e))
34: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnZgetrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
35: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnZgetrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
36: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnZgetrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(g),(cuDoubleComplex*)(h),(i),(j))
37: #define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnZgeqrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
38: #define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i) cusolverDnZgeqrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(cuDoubleComplex*)(g),(h),(i))
39: #define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnZunmqr_bufferSize((a),(b),(c),(d),(e),(f),(cuDoubleComplex*)(g),(h),(cuDoubleComplex*)(i),(cuDoubleComplex*)(j),(k),(l))
40: #define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnZunmqr((a),(b),(c),(d),(e),(f),(cuDoubleComplex*)(g),(h),(cuDoubleComplex*)(i),(cuDoubleComplex*)(j),(k),(cuDoubleComplex*)(l),(m),(n))
41: #define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l) cublasZtrsm((a),(b),(c),(d),(e),(f),(g),(cuDoubleComplex*)(h),(cuDoubleComplex*)(i),(j),(cuDoubleComplex*)(k),(l))
42: #endif
43: #else /* real */
44: #if defined(PETSC_USE_REAL_SINGLE)
45: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnSpotrf((a),(b),(c),(d),(e),(f),(g),(h))
46: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnSpotrf_bufferSize((a),(b),(c),(d),(e),(f))
47: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnSpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
48: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnSpotri((a),(b),(c),(d),(e),(f),(g),(h))
49: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnSpotri_bufferSize((a),(b),(c),(d),(e),(f))
50: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnSsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
51: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnSsytrf_bufferSize((a),(b),(c),(d),(e))
52: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnSgetrf((a),(b),(c),(d),(e),(f),(g),(h))
53: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnSgetrf_bufferSize((a),(b),(c),(d),(e),(f))
54: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnSgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
55: #define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnSgeqrf_bufferSize((a),(b),(c),(float*)(d),(e),(f))
56: #define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i) cusolverDnSgeqrf((a),(b),(c),(float*)(d),(e),(float*)(f),(float*)(g),(h),(i))
57: #define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnSormqr_bufferSize((a),(b),(c),(d),(e),(f),(float*)(g),(h),(float*)(i),(float*)(j),(k),(l))
58: #define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnSormqr((a),(b),(c),(d),(e),(f),(float*)(g),(h),(float*)(i),(float*)(j),(k),(float*)(l),(m),(n))
59: #define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l) cublasStrsm((a),(b),(c),(d),(e),(f),(g),(float*)(h),(float*)(i),(j),(float*)(k),(l))
60: #else /* real double */
61: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnDpotrf((a),(b),(c),(d),(e),(f),(g),(h))
62: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnDpotrf_bufferSize((a),(b),(c),(d),(e),(f))
63: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnDpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
64: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnDpotri((a),(b),(c),(d),(e),(f),(g),(h))
65: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnDpotri_bufferSize((a),(b),(c),(d),(e),(f))
66: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnDsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
67: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnDsytrf_bufferSize((a),(b),(c),(d),(e))
68: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnDgetrf((a),(b),(c),(d),(e),(f),(g),(h))
69: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnDgetrf_bufferSize((a),(b),(c),(d),(e),(f))
70: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnDgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
71: #define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnDgeqrf_bufferSize((a),(b),(c),(double*)(d),(e),(f))
72: #define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i) cusolverDnDgeqrf((a),(b),(c),(double*)(d),(e),(double*)(f),(double*)(g),(h),(i))
73: #define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnDormqr_bufferSize((a),(b),(c),(d),(e),(f),(double*)(g),(h),(double*)(i),(double*)(j),(k),(l))
74: #define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnDormqr((a),(b),(c),(d),(e),(f),(double*)(g),(h),(double*)(i),(double*)(j),(k),(double*)(l),(m),(n))
75: #define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l) cublasDtrsm((a),(b),(c),(d),(e),(f),(g),(double*)(h),(double*)(i),(j),(double*)(k),(l))
76: #endif
77: #endif
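/* Editorial note (not part of the PETSc source): the macros above give the rest of this file a
   single precision-agnostic spelling (cusolverDnXpotrf, cusolverDnXgetrf, cublasXtrsm, ...) that
   expands to the S/D/C/Z cuSOLVER and cuBLAS routines matching the PetscScalar configuration;
   the complex variants additionally cast PetscScalar* to cuComplex* or cuDoubleComplex*. */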
79: typedef struct {
80: PetscScalar *d_v; /* pointer to the matrix on the GPU */
81: PetscBool user_alloc;
82: PetscScalar *unplacedarray; /* if one called MatCUDADensePlaceArray(), this is where it stashed the original */
83: PetscBool unplaced_user_alloc;
84: /* factorization support */
85: PetscCuBLASInt *d_fact_ipiv; /* device pivots */
86: PetscScalar *d_fact_tau; /* device QR tau vector */
87: PetscScalar *d_fact_work; /* device workspace */
88: PetscCuBLASInt fact_lwork;
89: PetscCuBLASInt *d_fact_info; /* device info */
90: /* workspace */
91: Vec workvec;
92: } Mat_SeqDenseCUDA;
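/* Editorial note (not part of the PETSc source): Mat_SeqDenseCUDA holds the device-side state that
   mirrors the host Mat_SeqDense. d_v is the column-major device buffer (leading dimension equal to
   the host lda), user_alloc records whether PETSc owns it, and A->offloadmask tracks whether the
   valid copy lives on the CPU, the GPU, or both. The d_fact_* members cache cuSOLVER pivot, tau,
   workspace and info buffers between factorizations so repeated factor/solve cycles reuse the same
   device allocations. */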
94: PetscErrorCode MatSeqDenseCUDASetPreallocation(Mat A, PetscScalar *d_data)
95: {
96: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
97: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
98: PetscErrorCode ierr;
99: PetscBool iscuda;
100: cudaError_t cerr;
103: PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&iscuda);
104: if (!iscuda) return(0);
105: /* CPU preallocation may not have been performed yet */
106: PetscLayoutSetUp(A->rmap);
107: PetscLayoutSetUp(A->cmap);
108: if (cA->lda <= 0) cA->lda = A->rmap->n;
109: if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
110: if (!d_data) { /* petsc-allocated storage */
111: size_t sz;
112: PetscIntMultError(cA->lda,A->cmap->n,NULL);
113: sz = cA->lda*A->cmap->n*sizeof(PetscScalar);
114: cerr = cudaMalloc((void**)&dA->d_v,sz);CHKERRCUDA(cerr);
115: cerr = cudaMemset(dA->d_v,0,sz);CHKERRCUDA(cerr);
116: dA->user_alloc = PETSC_FALSE;
117: } else { /* user-allocated storage */
118: dA->d_v = d_data;
119: dA->user_alloc = PETSC_TRUE;
120: }
121: A->offloadmask = PETSC_OFFLOAD_GPU;
122: A->preallocated = PETSC_TRUE;
123: A->assembled = PETSC_TRUE;
124: return(0);
125: }
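/* Usage sketch (illustrative only, not part of the PETSc source; m, n and d_buf are assumed to be
   declared by the caller): the usual way to reach the preallocation above is through
   MatCreateSeqDenseCUDA(), passing NULL for PETSc-owned device storage or a device pointer for
   user-owned storage that PETSc will not free.

     Mat A;
     MatCreateSeqDenseCUDA(PETSC_COMM_SELF,m,n,NULL,&A);        // PETSc-owned storage, zero-initialized by the preallocation above
     // or, with an existing device buffer of at least m*n PetscScalars:
     //   MatCreateSeqDenseCUDA(PETSC_COMM_SELF,m,n,d_buf,&A);  // user_alloc = PETSC_TRUE
*/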
127: PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
128: {
129: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
130: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
131: PetscErrorCode ierr;
132: cudaError_t cerr;
136: PetscInfo3(A,"%s matrix %d x %d\n",A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
137: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
138: if (!cA->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
139: MatSeqDenseSetPreallocation(A,NULL);
140: }
141: PetscLogEventBegin(MAT_DenseCopyFromGPU,A,0,0,0);
142: if (cA->lda > A->rmap->n) {
143: PetscInt n = A->cmap->n,m = A->rmap->n;
145: cerr = cudaMemcpy2D(cA->v,cA->lda*sizeof(PetscScalar),dA->d_v,cA->lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
146: } else {
147: cerr = cudaMemcpy(cA->v,dA->d_v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
148: }
149: PetscLogGpuToCpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
150: PetscLogEventEnd(MAT_DenseCopyFromGPU,A,0,0,0);
152: A->offloadmask = PETSC_OFFLOAD_BOTH;
153: }
154: return(0);
155: }
157: PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
158: {
159: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
160: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
161: PetscBool copy;
162: PetscErrorCode ierr;
163: cudaError_t cerr;
167: if (A->boundtocpu) return(0);
168: copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
169: PetscInfo3(A,"%s matrix %d x %d\n",copy ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
170: if (copy) {
171: if (!dA->d_v) { /* Allocate GPU memory if not present */
172: MatSeqDenseCUDASetPreallocation(A,NULL);
173: }
174: PetscLogEventBegin(MAT_DenseCopyToGPU,A,0,0,0);
175: if (cA->lda > A->rmap->n) {
176: PetscInt n = A->cmap->n,m = A->rmap->n;
178: cerr = cudaMemcpy2D(dA->d_v,cA->lda*sizeof(PetscScalar),cA->v,cA->lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
179: } else {
180: cerr = cudaMemcpy(dA->d_v,cA->v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
181: }
182: PetscLogCpuToGpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
183: PetscLogEventEnd(MAT_DenseCopyToGPU,A,0,0,0);
185: A->offloadmask = PETSC_OFFLOAD_BOTH;
186: }
187: return(0);
188: }
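/* Editorial note (not part of the PETSc source): MatSeqDenseCUDACopyFromGPU() and
   MatSeqDenseCUDACopyToGPU() keep the host buffer cA->v and the device buffer dA->d_v in sync.
   A transfer is issued only when the offload mask marks the destination as stale, after which the
   mask becomes PETSC_OFFLOAD_BOTH. When lda > m the columns are strided, so cudaMemcpy2D copies
   n columns of m entries each; otherwise a single contiguous cudaMemcpy is enough. */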
190: static PetscErrorCode MatCopy_SeqDenseCUDA(Mat A,Mat B,MatStructure str)
191: {
192: Mat_SeqDense *a = (Mat_SeqDense*)A->data,*b = (Mat_SeqDense*)B->data;
193: PetscErrorCode ierr;
194: const PetscScalar *va;
195: PetscScalar *vb;
196: PetscInt lda1=a->lda,lda2=b->lda,m=A->rmap->n,n=A->cmap->n;
197: cudaError_t cerr;
200: /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
201: if (A->ops->copy != B->ops->copy) {
202: MatCopy_Basic(A,B,str);
203: return(0);
204: }
205: if (m != B->rmap->n || n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"size(B) != size(A)");
206: MatDenseCUDAGetArrayRead(A,&va);
207: MatDenseCUDAGetArrayWrite(B,&vb);
208: PetscLogGpuTimeBegin();
209: if (lda1>m || lda2>m) {
210: cerr = cudaMemcpy2D(vb,lda2*sizeof(PetscScalar),va,lda1*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
211: } else {
212: cerr = cudaMemcpy(vb,va,m*(n*sizeof(PetscScalar)),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
213: }
214: PetscLogGpuTimeEnd();
215: MatDenseCUDARestoreArrayWrite(B,&vb);
216: MatDenseCUDARestoreArrayRead(A,&va);
217: return(0);
218: }
220: static PetscErrorCode MatZeroEntries_SeqDenseCUDA(Mat A)
221: {
222: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
223: PetscErrorCode ierr;
224: PetscScalar *va;
225: PetscInt lda=a->lda,m = A->rmap->n,n = A->cmap->n;
226: cudaError_t cerr;
229: MatDenseCUDAGetArrayWrite(A,&va);
230: PetscLogGpuTimeBegin();
231: if (lda>m) {
232: cerr = cudaMemset2D(va,lda*sizeof(PetscScalar),0,m*sizeof(PetscScalar),n);CHKERRCUDA(cerr);
233: } else {
234: cerr = cudaMemset(va,0,m*(n*sizeof(PetscScalar)));CHKERRCUDA(cerr);
235: }
236: cerr = WaitForCUDA();CHKERRCUDA(cerr);
237: PetscLogGpuTimeEnd();
238: MatDenseCUDARestoreArrayWrite(A,&va);
239: return(0);
240: }
242: static PetscErrorCode MatDenseCUDAPlaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
243: {
244: Mat_SeqDense *aa = (Mat_SeqDense*)A->data;
245: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
246: PetscErrorCode ierr;
249: if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
250: if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
251: if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
252: if (aa->v) { MatSeqDenseCUDACopyToGPU(A); }
253: dA->unplacedarray = dA->d_v;
254: dA->unplaced_user_alloc = dA->user_alloc;
255: dA->d_v = (PetscScalar*)a;
256: dA->user_alloc = PETSC_TRUE;
257: return(0);
258: }
260: static PetscErrorCode MatDenseCUDAResetArray_SeqDenseCUDA(Mat A)
261: {
262: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
263: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
264: PetscErrorCode ierr;
267: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
268: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
269: if (a->v) { MatSeqDenseCUDACopyToGPU(A); }
270: dA->d_v = dA->unplacedarray;
271: dA->user_alloc = dA->unplaced_user_alloc;
272: dA->unplacedarray = NULL;
273: return(0);
274: }
276: static PetscErrorCode MatDenseCUDAReplaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
277: {
278: Mat_SeqDense *aa = (Mat_SeqDense*)A->data;
279: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
280: cudaError_t cerr;
283: if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
284: if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
285: if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
286: if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
287: dA->d_v = (PetscScalar*)a;
288: dA->user_alloc = PETSC_FALSE;
289: return(0);
290: }
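/* Usage sketch (illustrative only, not part of the PETSc source; d_tmp is an assumed device buffer
   of at least lda*n PetscScalars): the public entry points MatDenseCUDAPlaceArray(),
   MatDenseCUDAResetArray() and MatDenseCUDAReplaceArray() dispatch to the routines above.

     MatDenseCUDAPlaceArray(A,d_tmp);    // stash PETSc's device buffer and use d_tmp instead
     // ... operate on A ...
     MatDenseCUDAResetArray(A);          // restore the stashed buffer

   MatDenseCUDAReplaceArray() instead hands the new buffer to the matrix permanently: the previous
   PETSc-owned buffer is freed and the replacement will later be freed by PETSc. */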
292: static PetscErrorCode MatDenseCUDAGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
293: {
294: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
295: PetscErrorCode ierr;
298: if (!dA->d_v) {
299: MatSeqDenseCUDASetPreallocation(A,NULL);
300: }
301: *a = dA->d_v;
302: return(0);
303: }
305: static PetscErrorCode MatDenseCUDARestoreArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
306: {
308: *a = NULL;
309: return(0);
310: }
312: static PetscErrorCode MatDenseCUDAGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
313: {
314: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
315: PetscErrorCode ierr;
318: MatSeqDenseCUDACopyToGPU(A);
319: *a = dA->d_v;
320: return(0);
321: }
323: static PetscErrorCode MatDenseCUDARestoreArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
324: {
326: *a = NULL;
327: return(0);
328: }
330: static PetscErrorCode MatDenseCUDAGetArray_SeqDenseCUDA(Mat A, PetscScalar **a)
331: {
332: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
333: PetscErrorCode ierr;
336: MatSeqDenseCUDACopyToGPU(A);
337: *a = dA->d_v;
338: return(0);
339: }
341: static PetscErrorCode MatDenseCUDARestoreArray_SeqDenseCUDA(Mat A, PetscScalar **a)
342: {
344: *a = NULL;
345: return(0);
346: }
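/* Editorial note (not part of the PETSc source): the accessor pairs above differ only in how they
   treat the offload state. The Read and plain (read-write) variants call MatSeqDenseCUDACopyToGPU()
   first so the device data is current, while the Write variant skips the transfer because the caller
   promises to overwrite the values. They back the public MatDenseCUDAGetArray(),
   MatDenseCUDAGetArrayRead() and MatDenseCUDAGetArrayWrite() calls and their Restore counterparts. */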
348: PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
349: {
350: #if PETSC_PKG_CUDA_VERSION_GE(10,1,0)
351: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
352: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
353: PetscScalar *da;
354: PetscErrorCode ierr;
355: cudaError_t ccer;
356: cusolverStatus_t cerr;
357: cusolverDnHandle_t handle;
358: PetscCuBLASInt n,lda;
359: #if defined(PETSC_USE_DEBUG)
360: PetscCuBLASInt info;
361: #endif
364: if (!A->rmap->n || !A->cmap->n) return(0);
365: PetscCUSOLVERDnGetHandle(&handle);
366: PetscCuBLASIntCast(A->cmap->n,&n);
367: PetscCuBLASIntCast(a->lda,&lda);
368: if (A->factortype == MAT_FACTOR_LU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDngetri not implemented");
369: else if (A->factortype == MAT_FACTOR_CHOLESKY) {
370: if (!dA->d_fact_ipiv) { /* spd */
371: PetscCuBLASInt il;
373: MatDenseCUDAGetArray(A,&da);
374: cerr = cusolverDnXpotri_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&il);CHKERRCUSOLVER(cerr);
375: if (il > dA->fact_lwork) {
376: dA->fact_lwork = il;
378: ccer = cudaFree(dA->d_fact_work);CHKERRCUDA(ccer);
379: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
380: }
381: PetscLogGpuTimeBegin();
382: cerr = cusolverDnXpotri(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
383: PetscLogGpuTimeEnd();
384: MatDenseCUDARestoreArray(A,&da);
385: /* TODO (write cuda kernel) */
386: MatSeqDenseSymmetrize_Private(A,PETSC_TRUE);
387: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytri not implemented");
388: }
389: #if defined(PETSC_USE_DEBUG)
390: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
391: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: leading minor of order %d is zero",info);
392: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
393: #endif
394: PetscLogGpuFlops(1.0*n*n*n/3.0);
395: A->ops->solve = NULL;
396: A->ops->solvetranspose = NULL;
397: A->ops->matsolve = NULL;
398: A->factortype = MAT_FACTOR_NONE;
400: PetscFree(A->solvertype);
401: return(0);
402: #else
403: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Upgrade to CUDA version 10.1.0 or higher");
404: #endif
405: }
407: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal(Mat A, Vec xx, Vec yy, PetscBool transpose,
408: PetscErrorCode (*matsolve)(Mat,PetscScalar*,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscBool))
409: {
410: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
411: PetscScalar *y;
412: PetscCuBLASInt m=0, k=0;
413: PetscBool xiscuda, yiscuda, aiscuda;
414: cudaError_t cerr;
415: PetscErrorCode ierr;
418: if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
419: PetscCuBLASIntCast(A->rmap->n,&m);
420: PetscCuBLASIntCast(A->cmap->n,&k);
421: PetscObjectTypeCompare((PetscObject)xx,VECSEQCUDA,&xiscuda);
422: PetscObjectTypeCompare((PetscObject)yy,VECSEQCUDA,&yiscuda);
423: {
424: const PetscScalar *x;
425: PetscBool xishost = PETSC_TRUE;
427: /* The logic here is to minimize the amount of memory copying:
428: if we called VecCUDAGetArrayRead(xx,&x) when xiscuda is true but the
429: data has not been offloaded to the GPU yet, the data would first be copied to the
430: GPU. But we only need the data in order to copy it into the y
431: array, so we take x wherever the data already resides so that
432: only one memcpy is performed */
433: if (xiscuda && xx->offloadmask & PETSC_OFFLOAD_GPU) {
434: VecCUDAGetArrayRead(xx, &x);
435: xishost = PETSC_FALSE;
436: } else {
437: VecGetArrayRead(xx, &x);
438: }
439: if (k < m || !yiscuda) {
440: if (!dA->workvec) {
441: VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
442: }
443: VecCUDAGetArrayWrite(dA->workvec, &y);
444: } else {
445: VecCUDAGetArrayWrite(yy,&y);
446: }
447: cerr = cudaMemcpy(y,x,m*sizeof(PetscScalar),xishost ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
448: }
449: PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&aiscuda);
450: if (!aiscuda) {
451: MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
452: }
453: (*matsolve) (A, y, m, m, 1, k, transpose);
454: if (!aiscuda) {
455: MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
456: }
457: if (k < m || !yiscuda) {
458: PetscScalar *yv;
460: /* The logic here is that the data is not yet in either yy's GPU array or its
461: CPU array. There is nothing in the interface to say where the user would like
462: it to end up. So we choose the GPU, because it is the faster option */
463: if (yiscuda) {
464: VecCUDAGetArrayWrite(yy,&yv);
465: } else {
466: VecGetArray(yy,&yv);
467: }
468: cerr = cudaMemcpy(yv,y,k*sizeof(PetscScalar),yiscuda ? cudaMemcpyDeviceToDevice: cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
469: if (yiscuda) {
470: VecCUDARestoreArrayWrite(yy,&yv);
471: } else {
472: VecRestoreArray(yy,&yv);
473: }
474: VecCUDARestoreArrayWrite(dA->workvec, &y);
475: } else {
476: VecCUDARestoreArrayWrite(yy,&y);
477: }
478: return(0);
479: }
481: static PetscErrorCode MatMatSolve_SeqDenseCUDA_Internal(Mat A, Mat B, Mat X, PetscBool transpose,
482: PetscErrorCode (*matsolve)(Mat,PetscScalar*,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscBool))
483: {
484: PetscScalar *y;
485: PetscInt n, _ldb, _ldx;
486: PetscBool biscuda, xiscuda, aiscuda;
487: PetscCuBLASInt nrhs=0,m=0,k=0,ldb=0,ldx=0,ldy=0;
488: cudaError_t cerr;
489: PetscErrorCode ierr;
492: if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
493: PetscCuBLASIntCast(A->rmap->n,&m);
494: PetscCuBLASIntCast(A->cmap->n,&k);
495: MatGetSize(B,NULL,&n);
496: PetscCuBLASIntCast(n,&nrhs);
497: MatDenseGetLDA(B,&_ldb);
498: PetscCuBLASIntCast(_ldb, &ldb);
499: MatDenseGetLDA(X,&_ldx);
500: PetscCuBLASIntCast(_ldx, &ldx);
502: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
503: PetscObjectTypeCompare((PetscObject)X,MATSEQDENSECUDA,&xiscuda);
504: {
505: /* The logic here is to minimize the amount of memory copying:
506: if we called MatDenseCUDAGetArrayRead(B,&b) when biscuda is true but the
507: data has not been offloaded to the GPU yet, the data would first be copied to the
508: GPU. But we only need the data in order to copy it into the y
509: array, so we take b wherever the data already resides so that
510: only one memcpy is performed */
511: const PetscScalar *b;
513: /* some copying from B will be involved */
514: PetscBool bishost = PETSC_TRUE;
516: if (biscuda && B->offloadmask & PETSC_OFFLOAD_GPU) {
517: MatDenseCUDAGetArrayRead(B,&b);
518: bishost = PETSC_FALSE;
519: } else {
520: MatDenseGetArrayRead(B,&b);
521: }
522: if (ldx < m || !xiscuda) {
523: /* X's array cannot serve as the array (too small or not on device), B's
524: * array cannot serve as the array (const), so allocate a new array */
525: ldy = m;
526: cerr = cudaMalloc((void**)&y,nrhs*m*sizeof(PetscScalar));CHKERRCUDA(cerr);
527: } else {
528: /* X's array should serve as the array */
529: ldy = ldx;
530: MatDenseCUDAGetArrayWrite(X,&y);
531: }
532: cerr = cudaMemcpy2D(y,ldy*sizeof(PetscScalar),b,ldb*sizeof(PetscScalar),m*sizeof(PetscScalar),nrhs,bishost ? cudaMemcpyHostToDevice: cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
533: if (bishost) {
534: MatDenseRestoreArrayRead(B,&b);
535: } else {
536: MatDenseCUDARestoreArrayRead(B,&b);
537: }
538: }
539: PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&aiscuda);
540: if (!aiscuda) {
541: MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
542: }
543: (*matsolve) (A, y, ldy, m, nrhs, k, transpose);
544: if (!aiscuda) {
545: MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A); /* convert A back to its original (host) type */
546: }
547: if (ldx < m || !xiscuda) {
548: PetscScalar *x;
550: /* The logic here is that the data is not yet in either X's GPU array or its
551: CPU array. There is nothing in the interface to say where the user would like
552: it to end up. So we choose the GPU, because it is the faster option */
553: if (xiscuda) {
554: MatDenseCUDAGetArrayWrite(X,&x);
555: } else {
556: MatDenseGetArray(X,&x);
557: }
558: cerr = cudaMemcpy2D(x,ldx*sizeof(PetscScalar),y,ldy*sizeof(PetscScalar),k*sizeof(PetscScalar),nrhs,xiscuda ? cudaMemcpyDeviceToDevice: cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
559: if (xiscuda) {
560: MatDenseCUDARestoreArrayWrite(X,&x);
561: } else {
562: MatDenseRestoreArray(X,&x);
563: }
564: cerr = cudaFree(y);CHKERRCUDA(cerr);
565: } else {
566: MatDenseCUDARestoreArrayWrite(X,&y);
567: }
568: return(0);
569: }
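/* Editorial note (not part of the PETSc source): the two *_Internal routines above stage the
   right-hand side into a device buffer with leading dimension at least m, invoke one of the
   factorization-specific kernels below through the matsolve function pointer, and then copy the
   first k rows of the result into the output Vec or Mat, preferring to leave it on the GPU.
   The thin wrappers further below simply bind the LU, Cholesky or QR kernel and the transpose flag. */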
571: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_LU(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
572: {
573: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
574: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
575: const PetscScalar *da;
576: PetscCuBLASInt lda;
577: cusolverDnHandle_t handle;
578: cudaError_t ccer;
579: cusolverStatus_t cerr;
580: int info;
581: PetscErrorCode ierr;
584: PetscCuBLASIntCast(mat->lda,&lda);
585: MatDenseCUDAGetArrayRead(A,&da);
586: PetscCUSOLVERDnGetHandle(&handle);
587: PetscLogGpuTimeBegin();
588: PetscInfo2(A,"LU solve %d x %d on backend\n",m,k);
589: cerr = cusolverDnXgetrs(handle,T ? CUBLAS_OP_T : CUBLAS_OP_N,m,nrhs,da,lda,dA->d_fact_ipiv,x,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
590: PetscLogGpuTimeEnd();
591: MatDenseCUDARestoreArrayRead(A,&da);
592: if (PetscDefined(USE_DEBUG)) {
593: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
594: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
595: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
596: }
597: PetscLogGpuFlops(nrhs*(2.0*m*m - m));
598: return(0);
599: }
601: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_Cholesky(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
602: {
603: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
604: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
605: const PetscScalar *da;
606: PetscCuBLASInt lda;
607: cusolverDnHandle_t handle;
608: cudaError_t ccer;
609: cusolverStatus_t cerr;
610: int info;
611: PetscErrorCode ierr;
614: PetscCuBLASIntCast(mat->lda,&lda);
615: MatDenseCUDAGetArrayRead(A,&da);
616: PetscCUSOLVERDnGetHandle(&handle);
617: PetscLogGpuTimeBegin();
618: PetscInfo2(A,"Cholesky solve %d x %d on backend\n",m,k);
619: if (!dA->d_fact_ipiv) { /* spd */
620: /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
621: cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,m,nrhs,da,lda,x,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
622: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
623: PetscLogGpuTimeEnd();
624: MatDenseCUDARestoreArrayRead(A,&da);
625: if (PetscDefined(USE_DEBUG)) {
626: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
627: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
628: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
629: }
630: PetscLogGpuFlops(nrhs*(2.0*m*m - m));
631: return(0);
632: }
634: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_QR(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
635: {
636: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
637: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
638: const PetscScalar *da;
639: PetscCuBLASInt lda, rank;
640: cusolverDnHandle_t handle;
641: cublasHandle_t bhandle;
642: cudaError_t ccer;
643: cusolverStatus_t csrr;
644: cublasStatus_t cbrr;
645: int info;
646: cublasOperation_t trans;
647: PetscScalar one = 1.;
648: PetscErrorCode ierr;
651: PetscCuBLASIntCast(mat->lda,&lda);
652: PetscCuBLASIntCast(mat->rank,&rank);
653: MatDenseCUDAGetArrayRead(A,&da);
654: PetscCUSOLVERDnGetHandle(&handle);
655: PetscCUBLASGetHandle(&bhandle);
656: PetscLogGpuTimeBegin();
657: PetscInfo2(A,"QR solve %d x %d on backend\n",m,k);
658: if (!T) {
659: if (PetscDefined(USE_COMPLEX)) {
660: trans = CUBLAS_OP_C;
661: } else {
662: trans = CUBLAS_OP_T;
663: }
664: csrr = cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, trans, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);CHKERRCUSOLVER(csrr);
665: if (PetscDefined(USE_DEBUG)) {
666: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
667: if (info != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
668: }
669: cbrr = cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);CHKERRCUBLAS(cbrr);
670: } else {
671: cbrr = cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);CHKERRCUBLAS(cbrr);
672: csrr = cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);CHKERRCUSOLVER(csrr);
673: if (PetscDefined(USE_DEBUG)) {
674: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
675: if (info != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
676: }
677: }
678: PetscLogGpuTimeEnd();
679: MatDenseCUDARestoreArrayRead(A,&da);
680: PetscLogFlops(nrhs*(4.0*m*mat->rank - PetscSqr(mat->rank)));
681: return(0);
682: }
684: static PetscErrorCode MatSolve_SeqDenseCUDA_LU(Mat A,Vec xx,Vec yy)
685: {
689: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU);
690: return(0);
691: }
693: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_LU(Mat A,Vec xx,Vec yy)
694: {
698: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU);
699: return(0);
700: }
702: static PetscErrorCode MatSolve_SeqDenseCUDA_Cholesky(Mat A,Vec xx,Vec yy)
703: {
707: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
708: return(0);
709: }
711: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A,Vec xx,Vec yy)
712: {
716: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
717: return(0);
718: }
720: static PetscErrorCode MatSolve_SeqDenseCUDA_QR(Mat A,Vec xx,Vec yy)
721: {
725: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR);
726: return(0);
727: }
729: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_QR(Mat A,Vec xx,Vec yy)
730: {
734: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR);
735: return(0);
736: }
738: static PetscErrorCode MatMatSolve_SeqDenseCUDA_LU(Mat A,Mat B,Mat X)
739: {
743: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU);
744: return(0);
745: }
747: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_LU(Mat A,Mat B,Mat X)
748: {
752: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU);
753: return(0);
754: }
756: static PetscErrorCode MatMatSolve_SeqDenseCUDA_Cholesky(Mat A,Mat B,Mat X)
757: {
761: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
762: return(0);
763: }
765: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A,Mat B,Mat X)
766: {
770: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
771: return(0);
772: }
774: static PetscErrorCode MatMatSolve_SeqDenseCUDA_QR(Mat A,Mat B,Mat X)
775: {
779: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR);
780: return(0);
781: }
783: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_QR(Mat A,Mat B,Mat X)
784: {
788: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR);
789: return(0);
790: }
792: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A,IS rperm,IS cperm,const MatFactorInfo *factinfo)
793: {
794: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
795: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
796: PetscScalar *da;
797: PetscCuBLASInt m,n,lda;
798: #if defined(PETSC_USE_DEBUG)
799: int info;
800: #endif
801: cusolverStatus_t cerr;
802: cusolverDnHandle_t handle;
803: cudaError_t ccer;
804: PetscErrorCode ierr;
807: if (!A->rmap->n || !A->cmap->n) return(0);
808: PetscCUSOLVERDnGetHandle(&handle);
809: MatDenseCUDAGetArray(A,&da);
810: PetscCuBLASIntCast(A->cmap->n,&n);
811: PetscCuBLASIntCast(A->rmap->n,&m);
812: PetscCuBLASIntCast(a->lda,&lda);
813: PetscInfo2(A,"LU factor %d x %d on backend\n",m,n);
814: if (!dA->d_fact_ipiv) {
815: ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
816: }
817: if (!dA->fact_lwork) {
818: cerr = cusolverDnXgetrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
819: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
820: }
821: if (!dA->d_fact_info) {
822: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
823: }
824: PetscLogGpuTimeBegin();
825: cerr = cusolverDnXgetrf(handle,m,n,da,lda,dA->d_fact_work,dA->d_fact_ipiv,dA->d_fact_info);CHKERRCUSOLVER(cerr);
826: PetscLogGpuTimeEnd();
827: MatDenseCUDARestoreArray(A,&da);
828: #if defined(PETSC_USE_DEBUG)
829: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
830: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
831: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
832: #endif
833: A->factortype = MAT_FACTOR_LU;
834: PetscLogGpuFlops(2.0*n*n*m/3.0);
836: A->ops->solve = MatSolve_SeqDenseCUDA_LU;
837: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA_LU;
838: A->ops->matsolve = MatMatSolve_SeqDenseCUDA_LU;
839: A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_LU;
841: PetscFree(A->solvertype);
842: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
843: return(0);
844: }
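/* Usage sketch (illustrative only, not part of the PETSc source; b and x are VECSEQCUDA vectors of
   compatible size): an in-place LU factorization on a MATSEQDENSECUDA matrix followed by solves.
   For dense matrices the permutations are ignored, so NULL is passed for them here.

     MatFactorInfo info;
     MatFactorInfoInitialize(&info);
     MatLUFactor(A,NULL,NULL,&info);   // runs MatLUFactor_SeqDenseCUDA (cusolverDnXgetrf)
     MatSolve(A,b,x);                  // MatSolve_SeqDenseCUDA_LU (cusolverDnXgetrs)
     MatSolveTranspose(A,b,x);         // same factors, transposed solve
*/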
846: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A,IS perm,const MatFactorInfo *factinfo)
847: {
848: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
849: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
850: PetscScalar *da;
851: PetscCuBLASInt n,lda;
852: #if defined(PETSC_USE_DEBUG)
853: int info;
854: #endif
855: cusolverStatus_t cerr;
856: cusolverDnHandle_t handle;
857: cudaError_t ccer;
858: PetscErrorCode ierr;
861: if (!A->rmap->n || !A->cmap->n) return(0);
862: PetscCUSOLVERDnGetHandle(&handle);
863: PetscCuBLASIntCast(A->rmap->n,&n);
864: PetscInfo2(A,"Cholesky factor %d x %d on backend\n",n,n);
865: if (A->spd) {
866: MatDenseCUDAGetArray(A,&da);
867: PetscCuBLASIntCast(a->lda,&lda);
868: if (!dA->fact_lwork) {
869: cerr = cusolverDnXpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
870: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
871: }
872: if (!dA->d_fact_info) {
873: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
874: }
875: PetscLogGpuTimeBegin();
876: cerr = cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
877: PetscLogGpuTimeEnd();
879: MatDenseCUDARestoreArray(A,&da);
880: #if defined(PETSC_USE_DEBUG)
881: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
882: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
883: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
884: #endif
885: A->factortype = MAT_FACTOR_CHOLESKY;
886: PetscLogGpuFlops(1.0*n*n*n/3.0);
887: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
888: #if 0
889: /* At the time of writing this interface (CUDA 10.0), cusolverDn does not implement the *sytrs and *hetr* routines.
890: The code below should work, and it can be activated once the *sytrs routines become available */
891: if (!dA->d_fact_ipiv) {
892: ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
893: }
894: if (!dA->fact_lwork) {
895: cerr = cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
896: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
897: }
898: if (!dA->d_fact_info) {
899: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
900: }
901: PetscLogGpuTimeBegin();
902: cerr = cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
903: PetscLogGpuTimeEnd();
904: #endif
906: A->ops->solve = MatSolve_SeqDenseCUDA_Cholesky;
907: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA_Cholesky;
908: A->ops->matsolve = MatMatSolve_SeqDenseCUDA_Cholesky;
909: A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_Cholesky;
910: PetscFree(A->solvertype);
911: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
912: return(0);
913: }
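/* Usage sketch (illustrative only, not part of the PETSc source; b and x are VECSEQCUDA vectors):
   the GPU Cholesky path above requires the SPD flag, since the sytrf/sytrs branch is still disabled.

     MatFactorInfo info;
     MatFactorInfoInitialize(&info);
     MatSetOption(A,MAT_SPD,PETSC_TRUE);
     MatCholeskyFactor(A,NULL,&info);  // cusolverDnXpotrf on the lower triangle
     MatSolve(A,b,x);                  // cusolverDnXpotrs
*/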
915: static PetscErrorCode MatQRFactor_SeqDenseCUDA(Mat A,IS col,const MatFactorInfo *factinfo)
916: {
917: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
918: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
919: PetscScalar *da;
920: PetscCuBLASInt m,min,max,n,lda;
921: #if defined(PETSC_USE_DEBUG)
922: int info;
923: #endif
924: cusolverStatus_t cerr;
925: cusolverDnHandle_t handle;
926: cudaError_t ccer;
927: PetscErrorCode ierr;
930: if (!A->rmap->n || !A->cmap->n) return(0);
931: PetscCUSOLVERDnGetHandle(&handle);
932: MatDenseCUDAGetArray(A,&da);
933: PetscCuBLASIntCast(A->cmap->n,&n);
934: PetscCuBLASIntCast(A->rmap->n,&m);
935: PetscCuBLASIntCast(a->lda,&lda);
936: PetscInfo2(A,"QR factor %d x %d on backend\n",m,n);
937: max = PetscMax(m,n);
938: min = PetscMin(m,n);
939: if (!dA->d_fact_tau) {
940: ccer = cudaMalloc((void**)&dA->d_fact_tau,min*sizeof(*dA->d_fact_tau));CHKERRCUDA(ccer);
941: }
942: if (!dA->d_fact_ipiv) {
943: ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
944: }
945: if (!dA->fact_lwork) {
946: cerr = cusolverDnXgeqrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
947: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
948: }
949: if (!dA->d_fact_info) {
950: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
951: }
952: if (!dA->workvec) {
953: VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
954: }
955: PetscLogGpuTimeBegin();
956: cerr = cusolverDnXgeqrf(handle,m,n,da,lda,dA->d_fact_tau,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
957: PetscLogGpuTimeEnd();
958: MatDenseCUDARestoreArray(A,&da);
959: #if defined(PETSC_USE_DEBUG)
960: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
961: if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
962: #endif
963: A->factortype = MAT_FACTOR_QR;
964: a->rank = min;
965: PetscLogGpuFlops(2.0*min*min*(max-min/3.0));
967: A->ops->solve = MatSolve_SeqDenseCUDA_QR;
968: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA_QR;
969: A->ops->matsolve = MatMatSolve_SeqDenseCUDA_QR;
970: A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_QR;
972: PetscFree(A->solvertype);
973: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
974: return(0);
975: }
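/* Usage sketch (illustrative only, not part of the PETSc source; b has A->rmap->n entries and x has
   A->cmap->n entries, both VECSEQCUDA): an in-place QR factorization. For a rectangular A with more
   rows than columns, the subsequent MatSolve() returns the least-squares solution via
   MatSolve_SeqDenseCUDA_QR above (ormqr followed by a triangular solve).

     MatFactorInfo info;
     MatFactorInfoInitialize(&info);
     MatQRFactor(A,NULL,&info);        // cusolverDnXgeqrf
     MatSolve(A,b,x);
*/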
977: /* GEMM kernel: C = op(A)*op(B), tA, tB flag transposition */
978: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A,Mat B,Mat C,PetscBool tA,PetscBool tB)
979: {
980: const PetscScalar *da,*db;
981: PetscScalar *dc;
982: PetscScalar one=1.0,zero=0.0;
983: PetscCuBLASInt m,n,k;
984: PetscInt alda,blda,clda;
985: PetscErrorCode ierr;
986: cublasHandle_t cublasv2handle;
987: PetscBool Aiscuda,Biscuda;
988: cublasStatus_t berr;
991: /* we may end up with SEQDENSE as one of the arguments */
992: PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&Aiscuda);
993: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&Biscuda);
994: if (!Aiscuda) {
995: MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
996: }
997: if (!Biscuda) {
998: MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
999: }
1000: PetscCuBLASIntCast(C->rmap->n,&m);
1001: PetscCuBLASIntCast(C->cmap->n,&n);
1002: if (tA) {
1003: PetscCuBLASIntCast(A->rmap->n,&k);
1004: } else {
1005: PetscCuBLASIntCast(A->cmap->n,&k);
1006: }
1007: if (!m || !n || !k) return(0);
1008: PetscInfo3(C,"Matrix-Matrix product %d x %d x %d on backend\n",m,k,n);
1009: MatDenseCUDAGetArrayRead(A,&da);
1010: MatDenseCUDAGetArrayRead(B,&db);
1011: MatDenseCUDAGetArrayWrite(C,&dc);
1012: MatDenseGetLDA(A,&alda);
1013: MatDenseGetLDA(B,&blda);
1014: MatDenseGetLDA(C,&clda);
1015: PetscCUBLASGetHandle(&cublasv2handle);
1016: PetscLogGpuTimeBegin();
1017: berr = cublasXgemm(cublasv2handle,tA ? CUBLAS_OP_T : CUBLAS_OP_N,tB ? CUBLAS_OP_T : CUBLAS_OP_N,
1018: m,n,k,&one,da,alda,db,blda,&zero,dc,clda);CHKERRCUBLAS(berr);
1019: PetscLogGpuTimeEnd();
1020: PetscLogGpuFlops(1.0*m*n*k + 1.0*m*n*(k-1));
1021: MatDenseCUDARestoreArrayRead(A,&da);
1022: MatDenseCUDARestoreArrayRead(B,&db);
1023: MatDenseCUDARestoreArrayWrite(C,&dc);
1024: if (!Aiscuda) {
1025: MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
1026: }
1027: if (!Biscuda) {
1028: MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
1029: }
1030: return(0);
1031: }
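/* Editorial note (not part of the PETSc source): the kernel above reduces to one cublasXgemm call,
   C = op(A)*op(B) with op() selected by the tA/tB flags; leading dimensions come from
   MatDenseGetLDA(), so matrices with lda > m are handled without repacking. The three wrappers
   below pick A^T*B, A*B and A*B^T for the MatProduct interface. */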
1033: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
1034: {
1038: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_TRUE,PETSC_FALSE);
1039: return(0);
1040: }
1042: PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
1043: {
1047: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_FALSE);
1048: return(0);
1049: }
1051: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
1052: {
1056: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_TRUE);
1057: return(0);
1058: }
1060: PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
1061: {
1065: MatProductSetFromOptions_SeqDense(C);
1066: return(0);
1067: }
1069: /* zz = op(A)*xx + yy
1070: if yy == NULL, only MatMult */
1071: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans)
1072: {
1073: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
1074: const PetscScalar *xarray,*da;
1075: PetscScalar *zarray;
1076: PetscScalar one=1.0,zero=0.0;
1077: PetscCuBLASInt m, n, lda;
1078: cublasHandle_t cublasv2handle;
1079: cublasStatus_t berr;
1080: PetscErrorCode ierr;
1083: if (yy && yy != zz) { /* mult add */
1084: VecCopy_SeqCUDA(yy,zz);
1085: }
1086: if (!A->rmap->n || !A->cmap->n) {
1087: if (!yy) { /* mult only */
1088: VecSet_SeqCUDA(zz,0.0);
1089: }
1090: return(0);
1091: }
1092: PetscInfo2(A,"Matrix-vector product %d x %d on backend\n",A->rmap->n,A->cmap->n);
1093: PetscCuBLASIntCast(A->rmap->n,&m);
1094: PetscCuBLASIntCast(A->cmap->n,&n);
1095: PetscCuBLASIntCast(mat->lda,&lda);
1096: PetscCUBLASGetHandle(&cublasv2handle);
1097: MatDenseCUDAGetArrayRead(A,&da);
1098: VecCUDAGetArrayRead(xx,&xarray);
1099: VecCUDAGetArray(zz,&zarray);
1100: PetscLogGpuTimeBegin();
1101: berr = cublasXgemv(cublasv2handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,
1102: m,n,&one,da,lda,xarray,1,(yy ? &one : &zero),zarray,1);CHKERRCUBLAS(berr);
1103: PetscLogGpuTimeEnd();
1104: PetscLogGpuFlops(2.0*A->rmap->n*A->cmap->n - (yy ? 0 : A->rmap->n));
1105: VecCUDARestoreArrayRead(xx,&xarray);
1106: VecCUDARestoreArray(zz,&zarray);
1107: MatDenseCUDARestoreArrayRead(A,&da);
1108: return(0);
1109: }
1111: PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
1112: {
1116: MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_FALSE);
1117: return(0);
1118: }
1120: PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
1121: {
1125: MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_TRUE);
1126: return(0);
1127: }
1129: PetscErrorCode MatMult_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
1130: {
1134: MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_FALSE);
1135: return(0);
1136: }
1138: PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
1139: {
1143: MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_TRUE);
1144: return(0);
1145: }
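/* Usage sketch (illustrative only, not part of the PETSc source): matrix-vector products stay on
   the device when the vectors are CUDA vectors.

     Vec x,y;
     MatCreateVecs(A,&x,&y);           // for a MATSEQDENSECUDA A these are VECSEQCUDA
     MatMult(A,x,y);                   // y = A x          (cublasXgemv, CUBLAS_OP_N)
     MatMultAdd(A,x,y,y);              // y = y + A x
     MatMultTranspose(A,y,x);          // x = A^T y        (CUBLAS_OP_T)
*/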
1147: static PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A,const PetscScalar **array)
1148: {
1149: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
1153: MatSeqDenseCUDACopyFromGPU(A);
1154: *array = mat->v;
1155: return(0);
1156: }
1158: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseCUDA(Mat A,PetscScalar **array)
1159: {
1160: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
1164: if (!mat->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
1165: MatSeqDenseSetPreallocation(A,NULL);
1166: }
1167: *array = mat->v;
1168: A->offloadmask = PETSC_OFFLOAD_CPU;
1169: return(0);
1170: }
1172: static PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A,PetscScalar **array)
1173: {
1174: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
1178: MatSeqDenseCUDACopyFromGPU(A);
1179: *array = mat->v;
1180: A->offloadmask = PETSC_OFFLOAD_CPU;
1181: return(0);
1182: }
1184: PetscErrorCode MatScale_SeqDenseCUDA(Mat Y,PetscScalar alpha)
1185: {
1186: Mat_SeqDense *y = (Mat_SeqDense*)Y->data;
1187: PetscScalar *dy;
1188: PetscCuBLASInt j,N,m,lday,one = 1;
1189: cublasHandle_t cublasv2handle;
1190: cublasStatus_t berr;
1194: PetscCUBLASGetHandle(&cublasv2handle);
1195: MatDenseCUDAGetArray(Y,&dy);
1196: PetscCuBLASIntCast(Y->rmap->n*Y->cmap->n,&N);
1197: PetscCuBLASIntCast(Y->rmap->n,&m);
1198: PetscCuBLASIntCast(y->lda,&lday);
1199: PetscInfo2(Y,"Performing Scale %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
1200: PetscLogGpuTimeBegin();
1201: if (lday>m) {
1202: for (j=0; j<Y->cmap->n; j++) {
1203: berr = cublasXscal(cublasv2handle,m,&alpha,dy+lday*j,one);CHKERRCUBLAS(berr);
1204: }
1205: } else {
1206: berr = cublasXscal(cublasv2handle,N,&alpha,dy,one);CHKERRCUBLAS(berr);
1207: }
1208: PetscLogGpuTimeEnd();
1209: PetscLogGpuFlops(N);
1210: MatDenseCUDARestoreArray(Y,&dy);
1211: return(0);
1212: }
1214: PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y,PetscScalar alpha,Mat X,MatStructure str)
1215: {
1216: Mat_SeqDense *x = (Mat_SeqDense*)X->data;
1217: Mat_SeqDense *y = (Mat_SeqDense*)Y->data;
1218: const PetscScalar *dx;
1219: PetscScalar *dy;
1220: PetscCuBLASInt j,N,m,ldax,lday,one = 1;
1221: cublasHandle_t cublasv2handle;
1222: cublasStatus_t berr;
1223: PetscErrorCode ierr;
1226: if (!X->rmap->n || !X->cmap->n) return(0);
1227: PetscCUBLASGetHandle(&cublasv2handle);
1228: MatDenseCUDAGetArrayRead(X,&dx);
1229: if (alpha != 0.0) {
1230: MatDenseCUDAGetArray(Y,&dy);
1231: } else {
1232: MatDenseCUDAGetArrayWrite(Y,&dy);
1233: }
1234: PetscCuBLASIntCast(X->rmap->n*X->cmap->n,&N);
1235: PetscCuBLASIntCast(X->rmap->n,&m);
1236: PetscCuBLASIntCast(x->lda,&ldax);
1237: PetscCuBLASIntCast(y->lda,&lday);
1238: PetscInfo2(Y,"Performing AXPY %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
1239: PetscLogGpuTimeBegin();
1240: if (ldax>m || lday>m) {
1241: for (j=0; j<X->cmap->n; j++) {
1242: berr = cublasXaxpy(cublasv2handle,m,&alpha,dx+j*ldax,one,dy+j*lday,one);CHKERRCUBLAS(berr);
1243: }
1244: } else {
1245: berr = cublasXaxpy(cublasv2handle,N,&alpha,dx,one,dy,one);CHKERRCUBLAS(berr);
1246: }
1247: PetscLogGpuTimeEnd();
1248: PetscLogGpuFlops(PetscMax(2.*N-1,0));
1249: MatDenseCUDARestoreArrayRead(X,&dx);
1250: if (alpha != 0.0) {
1251: MatDenseCUDARestoreArray(Y,&dy);
1252: } else {
1253: MatDenseCUDARestoreArrayWrite(Y,&dy);
1254: }
1255: return(0);
1256: }
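/* Editorial note (not part of the PETSc source): MatScale and MatAXPY above treat the dense storage
   as one long vector when the leading dimension equals the row count, and otherwise fall back to one
   cublasXscal / cublasXaxpy call per column to respect the stride. */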
1258: static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
1259: {
1260: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1261: cudaError_t cerr;
1262: PetscErrorCode ierr;
1265: if (dA) {
1266: if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
1267: if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
1268: cerr = cudaFree(dA->d_fact_tau);CHKERRCUDA(cerr);
1269: cerr = cudaFree(dA->d_fact_ipiv);CHKERRCUDA(cerr);
1270: cerr = cudaFree(dA->d_fact_info);CHKERRCUDA(cerr);
1271: cerr = cudaFree(dA->d_fact_work);CHKERRCUDA(cerr);
1272: VecDestroy(&dA->workvec);
1273: }
1274: PetscFree(A->spptr);
1275: return(0);
1276: }
1278: PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
1279: {
1280: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1284: /* prevent copying the data back if we own the data pointer */
1285: if (!a->user_alloc) { A->offloadmask = PETSC_OFFLOAD_CPU; }
1286: MatConvert_SeqDenseCUDA_SeqDense(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
1287: MatDestroy_SeqDense(A);
1288: return(0);
1289: }
1291: PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A,MatDuplicateOption cpvalues,Mat *B)
1292: {
1293: PetscErrorCode ierr;
1294: MatDuplicateOption hcpvalues = (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : cpvalues;
1297: MatCreate(PetscObjectComm((PetscObject)A),B);
1298: MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
1299: MatSetType(*B,((PetscObject)A)->type_name);
1300: MatDuplicateNoCreate_SeqDense(*B,A,hcpvalues);
1301: if (cpvalues == MAT_COPY_VALUES && hcpvalues != MAT_COPY_VALUES) {
1302: MatCopy_SeqDenseCUDA(A,*B,SAME_NONZERO_PATTERN);
1303: }
1304: return(0);
1305: }
1307: static PetscErrorCode MatGetColumnVector_SeqDenseCUDA(Mat A,Vec v,PetscInt col)
1308: {
1309: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1310: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1311: PetscErrorCode ierr;
1312: PetscScalar *x;
1313: PetscBool viscuda;
1314: cudaError_t cerr;
1317: PetscObjectTypeCompareAny((PetscObject)v,&viscuda,VECSEQCUDA,VECMPICUDA,VECCUDA,"");
1318: if (viscuda && !v->boundtocpu) { /* update device data */
1319: VecCUDAGetArrayWrite(v,&x);
1320: if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1321: cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1322: } else {
1323: cerr = cudaMemcpy(x,a->v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1324: }
1325: VecCUDARestoreArrayWrite(v,&x);
1326: } else { /* update host data */
1327: VecGetArrayWrite(v,&x);
1328: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask & PETSC_OFFLOAD_CPU) {
1329: PetscArraycpy(x,a->v+col*a->lda,A->rmap->n);
1330: } else if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1331: cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1332: }
1333: VecRestoreArrayWrite(v,&x);
1334: }
1335: return(0);
1336: }
1338: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A,MatFactorType ftype,Mat *fact)
1339: {
1343: MatCreate(PetscObjectComm((PetscObject)A),fact);
1344: MatSetSizes(*fact,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
1345: MatSetType(*fact,MATSEQDENSECUDA);
1346: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
1347: (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1348: (*fact)->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1349: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1350: (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1351: } else if (ftype == MAT_FACTOR_QR) {
1352: PetscObjectComposeFunction((PetscObject)(*fact),"MatQRFactor_C",MatQRFactor_SeqDense);
1353: PetscObjectComposeFunction((PetscObject)(*fact),"MatQRFactorSymbolic_C",MatQRFactorSymbolic_SeqDense);
1354: }
1355: (*fact)->factortype = ftype;
1356: PetscFree((*fact)->solvertype);
1357: PetscStrallocpy(MATSOLVERCUDA,&(*fact)->solvertype);
1358: PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_LU]);
1359: PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_ILU]);
1360: PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_CHOLESKY]);
1361: PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_ICC]);
1362: return(0);
1363: }
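/* Usage sketch (illustrative only, not part of the PETSc source; ksp is an assumed KSP object):
   this factory registers the GPU dense solver under MATSOLVERCUDA, so it can be selected through
   the usual factorization interfaces.

     PC pc;
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCLU);
     PCFactorSetMatSolverType(pc,MATSOLVERCUDA);
     // or directly:
     //   Mat F;
     //   MatGetFactor(A,MATSOLVERCUDA,MAT_FACTOR_LU,&F);
*/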
1365: static PetscErrorCode MatDenseGetColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1366: {
1367: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1371: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1372: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1373: MatDenseCUDAGetArray(A,(PetscScalar**)&a->ptrinuse);
1374: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1375: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1376: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1377: }
1378: a->vecinuse = col + 1;
1379: VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1380: *v = a->cvec;
1381: return(0);
1382: }
1384: static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1385: {
1386: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1390: if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1391: if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1392: a->vecinuse = 0;
1393: VecCUDAResetArray(a->cvec);
1394: MatDenseCUDARestoreArray(A,(PetscScalar**)&a->ptrinuse);
1395: *v = NULL;
1396: return(0);
1397: }
1399: static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1400: {
1401: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1405: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1406: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1407: MatDenseCUDAGetArrayRead(A,&a->ptrinuse);
1408: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1409: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1410: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1411: }
1412: a->vecinuse = col + 1;
1413: VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1414: VecLockReadPush(a->cvec);
1415: *v = a->cvec;
1416: return(0);
1417: }
1419: static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1420: {
1421: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1425: if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVecRead() first");
1426: if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1427: a->vecinuse = 0;
1428: VecLockReadPop(a->cvec);
1429: VecCUDAResetArray(a->cvec);
1430: MatDenseCUDARestoreArrayRead(A,&a->ptrinuse);
1431: *v = NULL;
1432: return(0);
1433: }
1435: static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1436: {
1437: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1441: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1442: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1443: MatDenseCUDAGetArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1444: if (!a->cvec) { /* use the data of A here so that the first call to VecCUDAPlaceArray() does not trigger a needless GPU allocation */
1445: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1446: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1447: }
1448: a->vecinuse = col + 1;
1449: VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1450: *v = a->cvec;
1451: return(0);
1452: }
1454: static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1455: {
1456: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1460: if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVecWrite() first");
1461: if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1462: a->vecinuse = 0;
1463: VecCUDAResetArray(a->cvec);
1464: MatDenseCUDARestoreArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1465: *v = NULL;
1466: return(0);
1467: }
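/*
   Caller-side sketch of the column-vector accessors above; A is assumed to be a
   MATSEQDENSECUDA matrix and col a valid column index (hypothetical names), with error
   checking omitted.  Only one column vector may be outstanding at a time:

     Vec       v;
     PetscReal nrm;

     MatDenseGetColumnVecRead(A,col,&v);
     VecNorm(v,NORM_2,&nrm);
     MatDenseRestoreColumnVecRead(A,col,&v);
*/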
1469: static PetscErrorCode MatDenseGetSubMatrix_SeqDenseCUDA(Mat A,PetscInt cbegin,PetscInt cend,Mat *v)
1470: {
1471: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1472: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1473: PetscErrorCode ierr;
1476: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1477: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1478: if (a->cmat && cend-cbegin != a->cmat->cmap->N) {
1479: MatDestroy(&a->cmat);
1480: }
1481: MatSeqDenseCUDACopyToGPU(A);
1482: if (!a->cmat) {
1483: MatCreateDenseCUDA(PetscObjectComm((PetscObject)A),A->rmap->n,PETSC_DECIDE,A->rmap->N,cend-cbegin,dA->d_v + (size_t)cbegin * (size_t)a->lda,&a->cmat);
1484: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cmat);
1485: } else {
1486: MatDenseCUDAPlaceArray(a->cmat,dA->d_v + (size_t)cbegin * (size_t)a->lda);
1487: }
1488: MatDenseSetLDA(a->cmat,a->lda);
1489: if (a->v) { MatDensePlaceArray(a->cmat,a->v + (size_t)cbegin * (size_t)a->lda); }
1490: a->cmat->offloadmask = A->offloadmask;
1491: a->matinuse = cbegin + 1;
1492: *v = a->cmat;
1493: return(0);
1494: }
1496: static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseCUDA(Mat A,Mat *v)
1497: {
1498: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1502: if (!a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetSubMatrix() first");
1503: if (!a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column matrix");
1504: if (*v != a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Not the matrix obtained from MatDenseGetSubMatrix()");
1505: a->matinuse = 0;
1506: A->offloadmask = PETSC_OFFLOAD_GPU;
1507: MatDenseCUDAResetArray(a->cmat);
1508: MatDenseResetArray(a->cmat);
1509: *v = NULL;
1510: return(0);
1511: }
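/*
   Corresponding caller-side sketch for the sub-matrix accessors; cbegin/cend delimit a
   contiguous block of columns and x, y are conforming vectors (hypothetical names, error
   checking omitted).  Any Mat operation can be applied to the returned MATSEQDENSECUDA
   view, MatMult() is used as an example; only one sub-matrix may be outstanding at a time:

     Mat sub;

     MatDenseGetSubMatrix(A,cbegin,cend,&sub);
     MatMult(sub,x,y);
     MatDenseRestoreSubMatrix(A,&sub);
*/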
1513: static PetscErrorCode MatDenseSetLDA_SeqDenseCUDA(Mat A,PetscInt lda)
1514: {
1515: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
1516: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1517: PetscBool data;
1520: data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
1521: if (!dA->user_alloc && data && cA->lda!=lda) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"LDA cannot be changed after allocation of internal storage");
1522: if (lda < A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"LDA %D must be at least matrix dimension %D",lda,A->rmap->n);
1523: cA->lda = lda;
1524: return(0);
1525: }
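/*
   Sketch of when MatDenseSetLDA() applies here: the leading dimension can only be changed
   before the internal device storage has been allocated, or when the storage was provided
   by the user.  d_array is a hypothetical user device buffer with leading dimension
   lda >= m; error checking omitted:

     MatCreateSeqDenseCUDA(PETSC_COMM_SELF,m,n,d_array,&A);
     MatDenseSetLDA(A,lda);
*/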
1527: static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A,PetscBool flg)
1528: {
1529: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1533: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1534: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1535: A->boundtocpu = flg;
1536: if (!flg) {
1537: PetscBool iscuda;
1539: PetscObjectTypeCompare((PetscObject)a->cvec,VECSEQCUDA,&iscuda);
1540: if (!iscuda) {
1541: VecDestroy(&a->cvec);
1542: }
1543: PetscObjectTypeCompare((PetscObject)a->cmat,MATSEQDENSECUDA,&iscuda);
1544: if (!iscuda) {
1545: MatDestroy(&a->cmat);
1546: }
1547: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDenseCUDA);
1548: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArrayRead_SeqDenseCUDA);
1549: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArrayWrite_SeqDenseCUDA);
1550: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDenseCUDA);
1551: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDenseCUDA);
1552: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDenseCUDA);
1553: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDenseCUDA);
1554: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDenseCUDA);
1555: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDenseCUDA);
1556: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDenseCUDA);
1557: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDenseCUDA);
1558: PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDenseCUDA);
1559: PetscObjectComposeFunction((PetscObject)A,"MatQRFactor_C",MatQRFactor_SeqDenseCUDA);
1561: A->ops->duplicate = MatDuplicate_SeqDenseCUDA;
1562: A->ops->mult = MatMult_SeqDenseCUDA;
1563: A->ops->multadd = MatMultAdd_SeqDenseCUDA;
1564: A->ops->multtranspose = MatMultTranspose_SeqDenseCUDA;
1565: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDenseCUDA;
1566: A->ops->matmultnumeric = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1567: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1568: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1569: A->ops->axpy = MatAXPY_SeqDenseCUDA;
1570: A->ops->choleskyfactor = MatCholeskyFactor_SeqDenseCUDA;
1571: A->ops->lufactor = MatLUFactor_SeqDenseCUDA;
1572: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDenseCUDA;
1573: A->ops->getcolumnvector = MatGetColumnVector_SeqDenseCUDA;
1574: A->ops->scale = MatScale_SeqDenseCUDA;
1575: A->ops->copy = MatCopy_SeqDenseCUDA;
1576: A->ops->zeroentries = MatZeroEntries_SeqDenseCUDA;
1577: } else {
1578: /* make sure we have an up-to-date copy on the CPU */
1579: MatSeqDenseCUDACopyFromGPU(A);
1580: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDense);
1581: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArray_SeqDense);
1582: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArray_SeqDense);
1583: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDense);
1584: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDense);
1585: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDense);
1586: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDense);
1587: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDense);
1588: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDense);
1589: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDense);
1590: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDense);
1591: PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDense);
1592: PetscObjectComposeFunction((PetscObject)A,"MatQRFactor_C",MatQRFactor_SeqDense);
1594: A->ops->duplicate = MatDuplicate_SeqDense;
1595: A->ops->mult = MatMult_SeqDense;
1596: A->ops->multadd = MatMultAdd_SeqDense;
1597: A->ops->multtranspose = MatMultTranspose_SeqDense;
1598: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDense;
1599: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1600: A->ops->matmultnumeric = MatMatMultNumeric_SeqDense_SeqDense;
1601: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1602: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1603: A->ops->axpy = MatAXPY_SeqDense;
1604: A->ops->choleskyfactor = MatCholeskyFactor_SeqDense;
1605: A->ops->lufactor = MatLUFactor_SeqDense;
1607: A->ops->getcolumnvector = MatGetColumnVector_SeqDense;
1608: A->ops->scale = MatScale_SeqDense;
1609: A->ops->copy = MatCopy_SeqDense;
1610: A->ops->zeroentries = MatZeroEntries_SeqDense;
1611: }
1612: if (a->cmat) {
1613: MatBindToCPU(a->cmat,flg);
1614: }
1615: return(0);
1616: }
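/*
   Usage sketch for the CPU/GPU binding above; A is an assembled MATSEQDENSECUDA matrix and
   x, y conforming vectors (hypothetical names, error checking omitted):

     MatBindToCPU(A,PETSC_TRUE);    subsequent operations use the _SeqDense (host) kernels
     MatMult(A,x,y);
     MatBindToCPU(A,PETSC_FALSE);   re-enable the _SeqDenseCUDA (device) kernels
*/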
1618: PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1619: {
1620: Mat B;
1621: Mat_SeqDense *a;
1622: PetscErrorCode ierr;
1625: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1626: /* TODO these cases should be optimized */
1627: MatConvert_Basic(M,type,reuse,newmat);
1628: return(0);
1629: }
1631: B = *newmat;
1632: MatBindToCPU_SeqDenseCUDA(B,PETSC_TRUE);
1633: MatReset_SeqDenseCUDA(B);
1634: PetscFree(B->defaultvectype);
1635: PetscStrallocpy(VECSTANDARD,&B->defaultvectype);
1636: PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSE);
1637: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",NULL);
1638: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C",NULL);
1639: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C",NULL);
1640: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C",NULL);
1641: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C",NULL);
1642: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C",NULL);
1643: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C",NULL);
1644: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C",NULL);
1645: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C",NULL);
1646: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C",NULL);
1647: PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",NULL);
1648: a = (Mat_SeqDense*)B->data;
1649: VecDestroy(&a->cvec); /* cvec might be VECSEQCUDA. Destroy it and rebuild a VECSEQ when needed */
1650: B->ops->bindtocpu = NULL;
1651: B->ops->destroy = MatDestroy_SeqDense;
1652: B->offloadmask = PETSC_OFFLOAD_CPU;
1653: return(0);
1654: }
1656: PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1657: {
1658: Mat_SeqDenseCUDA *dB;
1659: Mat B;
1660: Mat_SeqDense *a;
1661: PetscErrorCode ierr;
1664: PetscCUDAInitializeCheck();
1665: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1666: /* TODO these cases should be optimized */
1667: MatConvert_Basic(M,type,reuse,newmat);
1668: return(0);
1669: }
1671: B = *newmat;
1672: PetscFree(B->defaultvectype);
1673: PetscStrallocpy(VECCUDA,&B->defaultvectype);
1674: PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSECUDA);
1675: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C", MatConvert_SeqDenseCUDA_SeqDense);
1676: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C", MatDenseCUDAGetArray_SeqDenseCUDA);
1677: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C", MatDenseCUDAGetArrayRead_SeqDenseCUDA);
1678: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C", MatDenseCUDAGetArrayWrite_SeqDenseCUDA);
1679: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C", MatDenseCUDARestoreArray_SeqDenseCUDA);
1680: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C", MatDenseCUDARestoreArrayRead_SeqDenseCUDA);
1681: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C", MatDenseCUDARestoreArrayWrite_SeqDenseCUDA);
1682: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C", MatDenseCUDAPlaceArray_SeqDenseCUDA);
1683: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C", MatDenseCUDAResetArray_SeqDenseCUDA);
1684: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C", MatDenseCUDAReplaceArray_SeqDenseCUDA);
1685: PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",MatProductSetFromOptions_SeqAIJ_SeqDense);
1686: a = (Mat_SeqDense*)B->data;
1687: VecDestroy(&a->cvec); /* cvec might be VECSEQ. Destroy it and rebuild a VECSEQCUDA when needed */
1688: PetscNewLog(B,&dB);
1689: B->spptr = dB;
1691: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1693: MatBindToCPU_SeqDenseCUDA(B,PETSC_FALSE);
1694: B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
1695: B->ops->destroy = MatDestroy_SeqDenseCUDA;
1696: return(0);
1697: }
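/*
   Conversion sketch tying the two routines above together; Ahost is an existing MATSEQDENSE
   matrix (hypothetical name, error checking omitted).  MAT_INITIAL_MATRIX goes through
   MatConvert_Basic(), while MAT_INPLACE_MATRIX takes the specialized paths above:

     Mat Adev;

     MatConvert(Ahost,MATSEQDENSECUDA,MAT_INITIAL_MATRIX,&Adev);
     MatConvert(Adev,MATSEQDENSE,MAT_INPLACE_MATRIX,&Adev);
*/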
1699: /*@C
1700: MatCreateSeqDenseCUDA - Creates a sequential matrix in dense format using CUDA.
1702: Collective
1704: Input Parameters:
1705: + comm - MPI communicator
1706: . m - number of rows
1707: . n - number of columns
1708: - data - optional location of GPU matrix data. Set data=NULL for PETSc
1709: to control matrix memory allocation.
1711: Output Parameter:
1712: . A - the matrix
1716: Level: intermediate
1718: .seealso: MatCreate(), MatCreateSeqDense()
1719: @*/
1720: PetscErrorCode MatCreateSeqDenseCUDA(MPI_Comm comm,PetscInt m,PetscInt n,PetscScalar *data,Mat *A)
1721: {
1723: PetscMPIInt size;
1726: MPI_Comm_size(comm,&size);
1727: if (size > 1) SETERRQ1(comm,PETSC_ERR_ARG_WRONG,"Invalid communicator size %d",size);
1728: MatCreate(comm,A);
1729: MatSetSizes(*A,m,n,m,n);
1730: MatSetType(*A,MATSEQDENSECUDA);
1731: MatSeqDenseCUDASetPreallocation(*A,data);
1732: return(0);
1733: }
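/*
   Example of the constructor above (m and n are hypothetical sizes, error checking
   omitted); passing data=NULL lets PETSc allocate the device storage:

     Mat A;

     MatCreateSeqDenseCUDA(PETSC_COMM_SELF,m,n,NULL,&A);
     MatSetValue(A,0,0,1.0,INSERT_VALUES);
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
*/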
1735: /*MC
1736: MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.
1738: Options Database Keys:
1739: . -mat_type seqdensecuda - sets the matrix type to "seqdensecuda" during a call to MatSetFromOptions()
1741: Level: beginner
1742: M*/
1743: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
1744: {
1748: PetscCUDAInitializeCheck();
1749: MatCreate_SeqDense(B);
1750: MatConvert_SeqDense_SeqDenseCUDA(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
1751: return(0);
1752: }
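/*
   Options-database sketch matching the MATSEQDENSECUDA documentation above (m and n are
   hypothetical sizes, error checking omitted); run with -mat_type seqdensecuda to select
   this type at runtime:

     Mat A;

     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,m,n,m,n);
     MatSetFromOptions(A);
     MatSetUp(A);
*/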