Actual source code: densecuda.cu

petsc-3.13.6 2020-09-29
/*
     Defines the matrix operations for sequential dense with CUDA
*/
#include <petscpkg_version.h>
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
#include <../src/mat/impls/dense/seq/dense.h>
#include <petsccublas.h>

/* cublas definitions are here */
#include <petsc/private/cudavecimpl.h>

#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_REAL_SINGLE)
#define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnCpotrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
#define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnCpotrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
#define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnCpotrs((a),(b),(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g),(h),(i))
#define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnCpotri((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
#define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnCpotri_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
#define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnCsytrf((a),(b),(c),(cuComplex*)(d),(e),(f),(cuComplex*)(g),(h),(i))
#define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnCsytrf_bufferSize((a),(b),(cuComplex*)(c),(d),(e))
#define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnCgetrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
#define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnCgetrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
#define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnCgetrs((a),(b),(c),(d),(cuComplex*)(e),(f),(g),(cuComplex*)(h),(i),(j))
#else /* complex double */
#define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnZpotrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
#define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnZpotrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
#define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnZpotrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g),(h),(i))
#define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnZpotri((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
#define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnZpotri_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
#define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnZsytrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(f),(cuDoubleComplex*)(g),(h),(i))
#define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnZsytrf_bufferSize((a),(b),(cuDoubleComplex*)(c),(d),(e))
#define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnZgetrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
#define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnZgetrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
#define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnZgetrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(g),(cuDoubleComplex*)(h),(i),(j))
#endif
#else /* real single */
#if defined(PETSC_USE_REAL_SINGLE)
#define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnSpotrf((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnSpotrf_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnSpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
#define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnSpotri((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnSpotri_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnSsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
#define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnSsytrf_bufferSize((a),(b),(c),(d),(e))
#define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnSgetrf((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnSgetrf_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnSgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
#else /* real double */
#define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnDpotrf((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnDpotrf_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnDpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
#define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnDpotri((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnDpotri_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnDsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
#define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnDsytrf_bufferSize((a),(b),(c),(d),(e))
#define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnDgetrf((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnDgetrf_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnDgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
#endif
#endif

typedef struct {
  PetscScalar *d_v;   /* pointer to the matrix on the GPU */
  /* factorization support */
  int         *d_fact_ipiv; /* device pivots */
  PetscScalar *d_fact_work; /* device workspace */
  int         fact_lwork;
  int         *d_fact_info; /* device info */
  /* workspace */
  Vec         workvec;
} Mat_SeqDenseCUDA;
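
/*
   Implementation note: d_v is allocated lazily (in MatSeqDenseCUDACopyToGPU()
   or MatDenseCUDAGetArrayWrite() below) and holds lda*Nmax entries, so it
   mirrors the host storage layout including its leading dimension.
   A->offloadmask tracks which copy is valid: PETSC_OFFLOAD_CPU (host only),
   PETSC_OFFLOAD_GPU (device only), or PETSC_OFFLOAD_BOTH (in sync after a copy).
*/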

PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
{
  Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;
  cudaError_t      cerr;

  PetscInfo3(A,"%s matrix %d x %d\n",A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscLogEventBegin(MAT_DenseCopyFromGPU,A,0,0,0);
    if (cA->lda > A->rmap->n) {
      PetscInt j,m = A->rmap->n;

      for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better (see the cudaMemcpy2D note after MatSeqDenseCUDACopyToGPU) */
        cerr = cudaMemcpy(cA->v + j*cA->lda,dA->d_v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      }
    } else {
      cerr = cudaMemcpy(cA->v,dA->d_v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    PetscLogGpuToCpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
    PetscLogEventEnd(MAT_DenseCopyFromGPU,A,0,0,0);

    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return(0);
}

PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
{
  Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscBool        copy;
  PetscErrorCode   ierr;
  cudaError_t      cerr;

  if (A->boundtocpu) return(0);
  if (!dA->d_v) {
    cerr = cudaMalloc((void**)&dA->d_v,cA->lda*cA->Nmax*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
  PetscInfo3(A,"%s matrix %d x %d\n",copy ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
  if (copy) {
    PetscLogEventBegin(MAT_DenseCopyToGPU,A,0,0,0);
    if (cA->lda > A->rmap->n) {
      PetscInt j,m = A->rmap->n;

      for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better (see the cudaMemcpy2D note below) */
        cerr = cudaMemcpy(dA->d_v + j*cA->lda,cA->v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      }
    } else {
      cerr = cudaMemcpy(dA->d_v,cA->v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }
    PetscLogCpuToGpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
    PetscLogEventEnd(MAT_DenseCopyToGPU,A,0,0,0);

    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return(0);
}
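
/*
   Note on the TODOs above: when lda > m, the column-by-column cudaMemcpy loop
   can be collapsed into a single cudaMemcpy2D call, which copies a strided
   column block in one API invocation. A minimal sketch for the host-to-device
   direction (illustrative only, not part of the original source; the helper
   name is hypothetical):

     static PetscErrorCode MatSeqDenseCUDACopyToGPU_2D(Mat A)
     {
       Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
       Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
       size_t           pitch = cA->lda*sizeof(PetscScalar);
       cudaError_t      cerr;

       // width = contiguous column height in bytes, height = number of columns;
       // source and destination share the same pitch of lda scalars
       cerr = cudaMemcpy2D(dA->d_v,pitch,cA->v,pitch,
                           A->rmap->n*sizeof(PetscScalar),A->cmap->n,
                           cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
       return(0);
     }
*/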

PetscErrorCode MatDenseCUDAGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;

  if (!dA->d_v) {
    Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
    cudaError_t  cerr;

    cerr = cudaMalloc((void**)&dA->d_v,cA->lda*cA->Nmax*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  *a = dA->d_v;
  return(0);
}

PetscErrorCode MatDenseCUDARestoreArrayWrite(Mat A, PetscScalar **a)
{
  *a = NULL;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  return(0);
}

PetscErrorCode MatDenseCUDAGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;

  MatSeqDenseCUDACopyToGPU(A);
  *a   = dA->d_v;
  return(0);
}

PetscErrorCode MatDenseCUDARestoreArrayRead(Mat A, const PetscScalar **a)
{
  *a = NULL;
  return(0);
}

PetscErrorCode MatDenseCUDAGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;

  MatSeqDenseCUDACopyToGPU(A);
  *a   = dA->d_v;
  return(0);
}

PetscErrorCode MatDenseCUDARestoreArray(Mat A, PetscScalar **a)
{
  *a = NULL;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  return(0);
}
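
/*
   Typical usage of the accessors above (illustrative sketch): the Read variant
   triggers a host-to-device copy if the device copy is stale and leaves the
   offload mask untouched on restore, while the Write and read/write variants
   mark the device copy as authoritative on restore.

     const PetscScalar *v;
     PetscScalar       *w;

     MatDenseCUDAGetArrayRead(A,&v);     // copies to the GPU if needed
     ...                                 // kernels that only read v
     MatDenseCUDARestoreArrayRead(A,&v); // offloadmask unchanged

     MatDenseCUDAGetArray(A,&w);         // copies to the GPU if needed
     ...                                 // kernels that modify w
     MatDenseCUDARestoreArray(A,&w);     // offloadmask = PETSC_OFFLOAD_GPU
*/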

PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
{
#if PETSC_PKG_CUDA_VERSION_GE(10,1,0)
  Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscScalar        *da;
  PetscErrorCode     ierr;
  cudaError_t        ccer;
  cusolverStatus_t   cerr;
  cusolverDnHandle_t handle;
  int                n,lda;
#if defined(PETSC_USE_DEBUG)
  int                info;
#endif

  if (!A->rmap->n || !A->cmap->n) return(0);
  PetscCUSOLVERDnGetHandle(&handle);
  PetscMPIIntCast(A->cmap->n,&n);
  PetscMPIIntCast(a->lda,&lda);
  if (A->factortype == MAT_FACTOR_LU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDngetri not implemented");
  else if (A->factortype == MAT_FACTOR_CHOLESKY) {
    if (!dA->d_fact_ipiv) { /* spd */
      int il;

      MatDenseCUDAGetArray(A,&da);
      cerr = cusolverDnXpotri_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&il);CHKERRCUSOLVER(cerr);
      if (il > dA->fact_lwork) {
        dA->fact_lwork = il;

        ccer = cudaFree(dA->d_fact_work);CHKERRCUDA(ccer);
        ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
      }
      PetscLogGpuTimeBegin();
      cerr = cusolverDnXpotri(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
      ccer = WaitForGPU();CHKERRCUDA(ccer);
      PetscLogGpuTimeEnd();
      MatDenseCUDARestoreArray(A,&da);
      /* TODO (write cuda kernel) */
      MatSeqDenseSymmetrize_Private(A,PETSC_TRUE);
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytri not implemented");
  }
#if defined(PETSC_USE_DEBUG)
  ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
  if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: leading minor of order %d is zero",info);
  else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
#endif
  PetscLogGpuFlops(1.0*n*n*n/3.0);
  A->ops->solve          = NULL;
  A->ops->solvetranspose = NULL;
  A->ops->matsolve       = NULL;
  A->factortype          = MAT_FACTOR_NONE;

  PetscFree(A->solvertype);
  return(0);
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Upgrade to CUDA version 10.1.0 or higher");
#endif
}
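
/*
   Note: with CUBLAS_FILL_MODE_LOWER, cusolverDnXpotri() overwrites only the
   lower triangle of the factored matrix with the inverse, which is why the
   host-side MatSeqDenseSymmetrize_Private() call above mirrors the result into
   the upper triangle (until a dedicated CUDA kernel replaces it, per the TODO).
*/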

static PetscErrorCode MatMatSolve_SeqDenseCUDA(Mat A,Mat B,Mat X)
{
  Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
  Mat_SeqDense       *x = (Mat_SeqDense*)X->data;
  Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
  const PetscScalar  *da;
  PetscScalar        *dx;
  cusolverDnHandle_t handle;
  PetscBool          iscuda;
  int                nrhs,n,lda,ldx;
#if defined(PETSC_USE_DEBUG)
  int                info;
#endif
  cudaError_t        ccer;
  cusolverStatus_t   cerr;
  PetscErrorCode     ierr;

  if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
  if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
  if (X != B) {
    MatCopy(B,X,SAME_NONZERO_PATTERN);
  }
  MatDenseCUDAGetArrayRead(A,&da);
  /* MatMatSolve does not have a dispatching mechanism, we may end up with a MATSEQDENSE here */
  PetscObjectTypeCompare((PetscObject)X,MATSEQDENSECUDA,&iscuda);
  if (!iscuda) {
    MatConvert(X,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&X);
  }
  MatDenseCUDAGetArray(X,&dx);
  PetscMPIIntCast(A->rmap->n,&n);
  PetscMPIIntCast(X->cmap->n,&nrhs);
  PetscMPIIntCast(a->lda,&lda);
  PetscMPIIntCast(x->lda,&ldx);
  PetscCUSOLVERDnGetHandle(&handle);
  PetscLogGpuTimeBegin();
  if (A->factortype == MAT_FACTOR_LU) {
    PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
    cerr = cusolverDnXgetrs(handle,CUBLAS_OP_N,n,nrhs,da,lda,dA->d_fact_ipiv,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
  } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
    PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
    if (!dA->d_fact_ipiv) { /* spd */
      /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
      cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,nrhs,da,lda,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
  ccer = WaitForGPU();CHKERRCUDA(ccer);
  PetscLogGpuTimeEnd();
  MatDenseCUDARestoreArrayRead(A,&da);
  MatDenseCUDARestoreArray(X,&dx);
  if (!iscuda) {
    MatConvert(X,MATSEQDENSE,MAT_INPLACE_MATRIX,&X);
  }
#if defined(PETSC_USE_DEBUG)
  ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
  if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
  else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
#endif
  PetscLogGpuFlops(nrhs*(2.0*n*n - n));
  return(0);
}
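
/*
   Flop accounting: each of the nrhs right-hand sides costs a forward and a
   backward triangular solve against the stored factors, roughly n*n flops
   each, which gives the nrhs*(2*n*n - n) logged above.
*/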

static PetscErrorCode MatSolve_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,PetscBool trans)
{
  Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
  const PetscScalar  *da;
  PetscScalar        *y;
  cusolverDnHandle_t handle;
  int                one = 1,n,lda;
#if defined(PETSC_USE_DEBUG)
  int                info;
#endif
  cudaError_t        ccer;
  cusolverStatus_t   cerr;
  PetscBool          iscuda;
  PetscErrorCode     ierr;

  if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
  if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
  PetscMPIIntCast(A->rmap->n,&n);
  /* MatSolve does not have a dispatching mechanism, we may end up with a VECSTANDARD here */
  PetscObjectTypeCompareAny((PetscObject)yy,&iscuda,VECSEQCUDA,VECMPICUDA,"");
  if (iscuda) {
    VecCopy(xx,yy);
    VecCUDAGetArray(yy,&y);
  } else {
    if (!dA->workvec) {
      MatCreateVecs(A,&dA->workvec,NULL);
    }
    VecCopy(xx,dA->workvec);
    VecCUDAGetArray(dA->workvec,&y);
  }
  MatDenseCUDAGetArrayRead(A,&da);
  PetscMPIIntCast(a->lda,&lda);
  PetscCUSOLVERDnGetHandle(&handle);
  PetscLogGpuTimeBegin();
  if (A->factortype == MAT_FACTOR_LU) {
    PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
    cerr = cusolverDnXgetrs(handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,n,one,da,lda,dA->d_fact_ipiv,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
  } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
    PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
    if (!dA->d_fact_ipiv) { /* spd */
      /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
      cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,one,da,lda,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
  ccer = WaitForGPU();CHKERRCUDA(ccer);
  PetscLogGpuTimeEnd();
  if (iscuda) {
    VecCUDARestoreArray(yy,&y);
  } else {
    VecCUDARestoreArray(dA->workvec,&y);
    VecCopy(dA->workvec,yy);
  }
  MatDenseCUDARestoreArrayRead(A,&da);
#if defined(PETSC_USE_DEBUG)
  ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
  if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
  else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
#endif
  PetscLogGpuFlops(2.0*n*n - n);
  return(0);
}

static PetscErrorCode MatSolve_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode     ierr;

  MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_FALSE);
  return(0);
}

static PetscErrorCode MatSolveTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode     ierr;

  MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_TRUE);
  return(0);
}

static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A,IS rperm,IS cperm,const MatFactorInfo *factinfo)
{
  Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscScalar        *da;
  int                m,n,lda;
#if defined(PETSC_USE_DEBUG)
  int                info;
#endif
  cusolverStatus_t   cerr;
  cusolverDnHandle_t handle;
  cudaError_t        ccer;
  PetscErrorCode     ierr;

  if (!A->rmap->n || !A->cmap->n) return(0);
  PetscCUSOLVERDnGetHandle(&handle);
  MatDenseCUDAGetArray(A,&da);
  PetscMPIIntCast(A->cmap->n,&n);
  PetscMPIIntCast(A->rmap->n,&m);
  PetscMPIIntCast(a->lda,&lda);
  PetscInfo2(A,"LU factor %d x %d on backend\n",m,n);
  if (!dA->d_fact_ipiv) {
    ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
  }
  if (!dA->fact_lwork) {
    cerr = cusolverDnXgetrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
    ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
  }
  if (!dA->d_fact_info) {
    ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
  }
  PetscLogGpuTimeBegin();
  cerr = cusolverDnXgetrf(handle,m,n,da,lda,dA->d_fact_work,dA->d_fact_ipiv,dA->d_fact_info);CHKERRCUSOLVER(cerr);
  ccer = WaitForGPU();CHKERRCUDA(ccer);
  PetscLogGpuTimeEnd();
  MatDenseCUDARestoreArray(A,&da);
#if defined(PETSC_USE_DEBUG)
  ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
  if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
  else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
#endif
  A->factortype = MAT_FACTOR_LU;
  PetscLogGpuFlops(2.0*n*n*m/3.0);

  A->ops->solve          = MatSolve_SeqDenseCUDA;
  A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
  A->ops->matsolve       = MatMatSolve_SeqDenseCUDA;

  PetscFree(A->solvertype);
  PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
  return(0);
}
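
/*
   Illustrative usage sketch (not part of the original source; assumes a
   preallocated MATSEQDENSECUDA matrix A and compatible vectors b and x):

     MatFactorInfo info;

     MatFactorInfoInitialize(&info);
     MatLUFactor(A,NULL,NULL,&info); // dispatches here: getrf on the GPU
     MatSolve(A,b,x);                // getrs on the GPU, see MatSolve_SeqDenseCUDA
*/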

static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A,IS perm,const MatFactorInfo *factinfo)
{
  Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscScalar        *da;
  int                n,lda;
#if defined(PETSC_USE_DEBUG)
  int                info;
#endif
  cusolverStatus_t   cerr;
  cusolverDnHandle_t handle;
  cudaError_t        ccer;
  PetscErrorCode     ierr;

  if (!A->rmap->n || !A->cmap->n) return(0);
  PetscCUSOLVERDnGetHandle(&handle);
  PetscMPIIntCast(A->rmap->n,&n);
  PetscInfo2(A,"Cholesky factor %d x %d on backend\n",n,n);
  if (A->spd) {
    MatDenseCUDAGetArray(A,&da);
    PetscMPIIntCast(a->lda,&lda);
    if (!dA->fact_lwork) {
      cerr = cusolverDnXpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
      ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
    }
    if (!dA->d_fact_info) {
      ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
    }
    PetscLogGpuTimeBegin();
    cerr = cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
    ccer = WaitForGPU();CHKERRCUDA(ccer);
    PetscLogGpuTimeEnd();

    MatDenseCUDARestoreArray(A,&da);
#if defined(PETSC_USE_DEBUG)
    ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
    if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
    else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
#endif
    A->factortype = MAT_FACTOR_CHOLESKY;
    PetscLogGpuFlops(1.0*n*n*n/3.0);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
#if 0
    /* at the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs and *hetr* routines.
       The code below should work, and it can be activated once the *sytrs routines become available */
    if (!dA->d_fact_ipiv) {
      ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
    }
    if (!dA->fact_lwork) {
      cerr = cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
      ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
    }
    if (!dA->d_fact_info) {
      ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
    }
    PetscLogGpuTimeBegin();
    cerr = cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
    PetscLogGpuTimeEnd();
#endif

  A->ops->solve          = MatSolve_SeqDenseCUDA;
  A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
  A->ops->matsolve       = MatMatSolve_SeqDenseCUDA;

  PetscFree(A->solvertype);
  PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
  return(0);
}
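
/*
   Illustrative usage sketch (not part of the original source): the SPD branch
   above is reached when the user has declared the matrix symmetric positive
   definite:

     MatFactorInfo info;

     MatFactorInfoInitialize(&info);
     MatSetOption(A,MAT_SPD,PETSC_TRUE);
     MatCholeskyFactor(A,NULL,&info); // potrf on the GPU
     MatSolve(A,b,x);                 // potrs on the GPU
*/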

/* GEMM kernel: C = op(A)*op(B), tA, tB flag transposition */
static PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A,Mat B,Mat C,PetscBool tA, PetscBool tB)
{
  Mat_SeqDense      *a = (Mat_SeqDense*)A->data;
  Mat_SeqDense      *b = (Mat_SeqDense*)B->data;
  Mat_SeqDense      *c = (Mat_SeqDense*)C->data;
  const PetscScalar *da,*db;
  PetscScalar       *dc;
  PetscScalar       one=1.0,zero=0.0;
  int               m,n,k,alda,blda,clda;
  PetscErrorCode    ierr;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    berr;
  cudaError_t       cerr;

  PetscMPIIntCast(C->rmap->n,&m);
  PetscMPIIntCast(C->cmap->n,&n);
  if (tA) {
    PetscMPIIntCast(A->rmap->n,&k);
  } else {
    PetscMPIIntCast(A->cmap->n,&k);
  }
  if (!m || !n || !k) return(0);
  PetscInfo3(C,"Matrix-Matrix product %d x %d x %d on backend\n",m,k,n);
  MatDenseCUDAGetArrayRead(A,&da);
  MatDenseCUDAGetArrayRead(B,&db);
  MatDenseCUDAGetArrayWrite(C,&dc);
  PetscMPIIntCast(a->lda,&alda);
  PetscMPIIntCast(b->lda,&blda);
  PetscMPIIntCast(c->lda,&clda);
  PetscCUBLASGetHandle(&cublasv2handle);
  PetscLogGpuTimeBegin();
  berr = cublasXgemm(cublasv2handle,tA ? CUBLAS_OP_T : CUBLAS_OP_N,tB ? CUBLAS_OP_T : CUBLAS_OP_N,
                     m,n,k,&one,da,alda,db,blda,&zero,dc,clda);CHKERRCUBLAS(berr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(1.0*m*n*k + 1.0*m*n*(k-1));
  MatDenseCUDARestoreArrayRead(A,&da);
  MatDenseCUDARestoreArrayRead(B,&db);
  MatDenseCUDARestoreArrayWrite(C,&dc);
  return(0);
}
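
/*
   Note: cuBLAS uses column-major storage, as PETSc dense matrices do, so the
   m, n, k sizes and the leading dimensions above map directly onto
   cublasXgemm. The logged flop count is m*n*k multiplications plus m*n*(k-1)
   additions, i.e. C = op(A)*op(B) with beta = 0.
*/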

PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
{

  MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_TRUE,PETSC_FALSE);
  return(0);
}

PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
{

  MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_FALSE);
  return(0);
}

PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
{

  MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_TRUE);
  return(0);
}

PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
{

  MatProductSetFromOptions_SeqDense(C);
  return(0);
}

/* zz = op(A)*xx + yy
   if yy == NULL, only MatMult */
static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans)
{
  Mat_SeqDense      *mat = (Mat_SeqDense*)A->data;
  const PetscScalar *xarray,*da;
  PetscScalar       *zarray;
  PetscScalar       one=1.0,zero=0.0;
  int               m, n, lda; /* Use PetscMPIInt as it is typedef'ed to int */
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    berr;
  PetscErrorCode    ierr;

  if (yy && yy != zz) { /* mult add */
    VecCopy_SeqCUDA(yy,zz);
  }
  if (!A->rmap->n || !A->cmap->n) {
    if (!yy) { /* mult only */
      VecSet_SeqCUDA(zz,0.0);
    }
    return(0);
  }
  PetscInfo2(A,"Matrix-vector product %d x %d on backend\n",A->rmap->n,A->cmap->n);
  PetscMPIIntCast(A->rmap->n,&m);
  PetscMPIIntCast(A->cmap->n,&n);
  PetscMPIIntCast(mat->lda,&lda);
  PetscCUBLASGetHandle(&cublasv2handle);
  MatDenseCUDAGetArrayRead(A,&da);
  VecCUDAGetArrayRead(xx,&xarray);
  VecCUDAGetArray(zz,&zarray);
  PetscLogGpuTimeBegin();
  berr = cublasXgemv(cublasv2handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,
                     m,n,&one,da,lda,xarray,1,(yy ? &one : &zero),zarray,1);CHKERRCUBLAS(berr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*A->rmap->n*A->cmap->n - (yy ? 0 : A->rmap->n));
  VecCUDARestoreArrayRead(xx,&xarray);
  VecCUDARestoreArray(zz,&zarray);
  MatDenseCUDARestoreArrayRead(A,&da);
  return(0);
}
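
/*
   Note: a single cublasXgemv call implements all four Mult variants below;
   yy (when given) is first copied into zz, and the beta argument
   (yy ? &one : &zero) selects between accumulating into zz and overwriting it.
*/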

PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
{

  MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_FALSE);
  return(0);
}

PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
{

  MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_TRUE);
  return(0);
}

PetscErrorCode MatMult_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
{

  MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_FALSE);
  return(0);
}

PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
{

  MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_TRUE);
  return(0);
}

PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A,const PetscScalar *array[])
{
  Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

  MatSeqDenseCUDACopyFromGPU(A);
  *array = mat->v;
  return(0);
}

PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A,PetscScalar *array[])
{
  Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

  MatSeqDenseCUDACopyFromGPU(A);
  *array = mat->v;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  return(0);
}

PetscErrorCode MatDenseRestoreArray_SeqDenseCUDA(Mat A,PetscScalar *array[])
{
  return(0);
}

PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y,PetscScalar alpha,Mat X,MatStructure str)
{
  Mat_SeqDense      *x = (Mat_SeqDense*)X->data;
  Mat_SeqDense      *y = (Mat_SeqDense*)Y->data;
  const PetscScalar *dx;
  PetscScalar       *dy;
  int               j,N,m,ldax,lday,one = 1;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    berr;
  PetscErrorCode    ierr;
  cudaError_t       cerr;

  if (!X->rmap->n || !X->cmap->n) return(0);
  PetscCUBLASGetHandle(&cublasv2handle);
  MatDenseCUDAGetArrayRead(X,&dx);
  if (alpha != 0.0) {
    MatDenseCUDAGetArray(Y,&dy);
  } else {
    MatDenseCUDAGetArrayWrite(Y,&dy);
  }
  PetscMPIIntCast(X->rmap->n*X->cmap->n,&N);
  PetscMPIIntCast(X->rmap->n,&m);
  PetscMPIIntCast(x->lda,&ldax);
  PetscMPIIntCast(y->lda,&lday);
  PetscInfo2(Y,"Performing AXPY %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
  PetscLogGpuTimeBegin();
  if (ldax>m || lday>m) {
    for (j=0; j<X->cmap->n; j++) {
      berr = cublasXaxpy(cublasv2handle,m,&alpha,dx+j*ldax,one,dy+j*lday,one);CHKERRCUBLAS(berr);
    }
  } else {
    berr = cublasXaxpy(cublasv2handle,N,&alpha,dx,one,dy,one);CHKERRCUBLAS(berr);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(PetscMax(2.*N-1,0));
  MatDenseCUDARestoreArrayRead(X,&dx);
  if (alpha != 0.0) {
    MatDenseCUDARestoreArray(Y,&dy);
  } else {
    MatDenseCUDARestoreArrayWrite(Y,&dy);
  }
  return(0);
}

static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
{
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  cudaError_t      cerr;
  PetscErrorCode   ierr;

  if (dA) {
    cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr);
    cerr = cudaFree(dA->d_fact_ipiv);CHKERRCUDA(cerr);
    cerr = cudaFree(dA->d_fact_info);CHKERRCUDA(cerr);
    cerr = cudaFree(dA->d_fact_work);CHKERRCUDA(cerr);
    VecDestroy(&dA->workvec);
  }
  PetscFree(A->spptr);
  return(0);
}

PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
{
  Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

  /* avoid copying the data back if we own the data pointer */
  if (!a->user_alloc) { A->offloadmask = PETSC_OFFLOAD_CPU; }
  MatConvert_SeqDenseCUDA_SeqDense(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
  MatDestroy_SeqDense(A);
  return(0);
}

PetscErrorCode MatSeqDenseSetPreallocation_SeqDenseCUDA(Mat B,PetscScalar *data)
{
  Mat_SeqDense     *b;
  Mat_SeqDenseCUDA *dB;
  cudaError_t      cerr;
  PetscErrorCode   ierr;

  PetscLayoutSetUp(B->rmap);
  PetscLayoutSetUp(B->cmap);
  b       = (Mat_SeqDense*)B->data;
  b->Mmax = B->rmap->n;
  b->Nmax = B->cmap->n;
  if (b->lda <= 0 || b->changelda) b->lda = B->rmap->n;
  if (b->lda < B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Invalid lda %D < %D",b->lda,B->rmap->n);

  PetscIntMultError(b->lda,b->Nmax,NULL);

  MatReset_SeqDenseCUDA(B);
  PetscNewLog(B,&dB);
  B->spptr = dB;
  cerr     = cudaMalloc((void**)&dB->d_v,b->lda*b->Nmax*sizeof(PetscScalar));CHKERRCUDA(cerr);

  if (!data) { /* petsc-allocated storage */
    if (!b->user_alloc) { PetscFree(b->v); }
    PetscCalloc1((size_t)b->lda*b->Nmax,&b->v);
    PetscLogObjectMemory((PetscObject)B,b->lda*b->Nmax*sizeof(PetscScalar));
    b->user_alloc = PETSC_FALSE;
  } else { /* user-allocated storage */
    if (!b->user_alloc) { PetscFree(b->v); }
    b->v          = data;
    b->user_alloc = PETSC_TRUE;
  }
  B->offloadmask  = PETSC_OFFLOAD_CPU;
  B->preallocated = PETSC_TRUE;
  B->assembled    = PETSC_TRUE;
  return(0);
}

PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A,MatDuplicateOption cpvalues,Mat *B)
{

  MatCreate(PetscObjectComm((PetscObject)A),B);
  MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
  MatSetType(*B,((PetscObject)A)->type_name);
  MatDuplicateNoCreate_SeqDense(*B,A,cpvalues);
  if (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) {
    Mat_SeqDense      *a = (Mat_SeqDense*)A->data;
    const PetscScalar *da;
    PetscScalar       *db;
    cudaError_t       cerr;

    MatDenseCUDAGetArrayRead(A,&da);
    MatDenseCUDAGetArrayWrite(*B,&db);
    if (a->lda > A->rmap->n) {
      PetscInt j,m = A->rmap->n;

      for (j=0; j<A->cmap->n; j++) { /* it can be done better */
        cerr = cudaMemcpy(db+j*m,da+j*a->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      }
    } else {
      cerr = cudaMemcpy(db,da,a->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    }
    MatDenseCUDARestoreArrayRead(A,&da);
    MatDenseCUDARestoreArrayWrite(*B,&db);
    (*B)->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return(0);
}

PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A,MatFactorType ftype,Mat *fact)
{

  MatCreate(PetscObjectComm((PetscObject)A),fact);
  MatSetSizes(*fact,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
  MatSetType(*fact,MATSEQDENSECUDA);
  if (ftype == MAT_FACTOR_LU) {
    (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
  } else {
    (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
  }
  (*fact)->factortype = ftype;

  PetscFree((*fact)->solvertype);
  PetscStrallocpy(MATSOLVERCUDA,&(*fact)->solvertype);
  return(0);
}

static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A,PetscBool flg)
{

  A->boundtocpu = flg;
  if (!flg) {
    PetscObjectComposeFunction((PetscObject)A,"MatSeqDenseSetPreallocation_C",MatSeqDenseSetPreallocation_SeqDenseCUDA);
    PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",           MatDenseGetArray_SeqDenseCUDA);
    PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",       MatDenseGetArrayRead_SeqDenseCUDA);
    PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreArray_C",       MatDenseRestoreArray_SeqDenseCUDA);

    A->ops->duplicate               = MatDuplicate_SeqDenseCUDA;
    A->ops->mult                    = MatMult_SeqDenseCUDA;
    A->ops->multadd                 = MatMultAdd_SeqDenseCUDA;
    A->ops->multtranspose           = MatMultTranspose_SeqDenseCUDA;
    A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDenseCUDA;
    A->ops->matmultnumeric          = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
    A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
    A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
    A->ops->axpy                    = MatAXPY_SeqDenseCUDA;
    A->ops->choleskyfactor          = MatCholeskyFactor_SeqDenseCUDA;
    A->ops->lufactor                = MatLUFactor_SeqDenseCUDA;
    A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDenseCUDA;
  } else {
    /* make sure we have an up-to-date copy on the CPU */
    MatSeqDenseCUDACopyFromGPU(A);
    PetscObjectComposeFunction((PetscObject)A,"MatSeqDenseSetPreallocation_C",MatSeqDenseSetPreallocation_SeqDense);
    PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",           MatDenseGetArray_SeqDense);
    PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",       MatDenseGetArray_SeqDense);
    PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreArray_C",       MatDenseRestoreArray_SeqDense);

    A->ops->duplicate               = MatDuplicate_SeqDense;
    A->ops->mult                    = MatMult_SeqDense;
    A->ops->multadd                 = MatMultAdd_SeqDense;
    A->ops->multtranspose           = MatMultTranspose_SeqDense;
    A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDense;
    A->ops->matmultnumeric          = MatMatMultNumeric_SeqDense_SeqDense;
    A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
    A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
    A->ops->axpy                    = MatAXPY_SeqDense;
    A->ops->choleskyfactor          = MatCholeskyFactor_SeqDense;
    A->ops->lufactor                = MatLUFactor_SeqDense;
    A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
  }
  return(0);
}

PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M,MatType type,MatReuse reuse,Mat *newmat)
{
  Mat              B;
  PetscErrorCode   ierr;

  if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
    /* TODO these cases should be optimized */
    MatConvert_Basic(M,type,reuse,newmat);
    return(0);
  }

  B    = *newmat;
  MatBindToCPU_SeqDenseCUDA(B,PETSC_TRUE);
  MatReset_SeqDenseCUDA(B);
  PetscFree(B->defaultvectype);
  PetscStrallocpy(VECSTANDARD,&B->defaultvectype);
  PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSE);
  PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",NULL);

  B->ops->bindtocpu = NULL;
  B->ops->destroy   = MatDestroy_SeqDense;
  B->offloadmask    = PETSC_OFFLOAD_CPU;
  return(0);
}

PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M,MatType type,MatReuse reuse,Mat *newmat)
{
  Mat_SeqDenseCUDA *dB;
  Mat              B;
  PetscErrorCode   ierr;

  if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
    /* TODO these cases should be optimized */
    MatConvert_Basic(M,type,reuse,newmat);
    return(0);
  }

  B    = *newmat;
  PetscFree(B->defaultvectype);
  PetscStrallocpy(VECCUDA,&B->defaultvectype);
  PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSECUDA);
  PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",MatConvert_SeqDenseCUDA_SeqDense);

  MatReset_SeqDenseCUDA(B);
  PetscNewLog(B,&dB);
  B->spptr = dB;

  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  MatBindToCPU_SeqDenseCUDA(B,PETSC_FALSE);
  B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
  B->ops->destroy   = MatDestroy_SeqDenseCUDA;
  return(0);
}

/*MC
   MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.

   Options Database Keys:
. -mat_type seqdensecuda - sets the matrix type to "seqdensecuda" during a call to MatSetFromOptions()

  Level: beginner

.seealso: MatCreateSeqDenseCUDA()

M*/
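
/*
   Illustrative creation sketch (not part of the original source):

     Mat A;

     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,m,n,m,n);
     MatSetType(A,MATSEQDENSECUDA);
     MatSeqDenseSetPreallocation(A,NULL); // dispatches to the CUDA preallocation above
     ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() ...
*/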
PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
{

  MatCreate_SeqDense(B);
  MatConvert_SeqDense_SeqDenseCUDA(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
  return(0);
}