// matseqdensecupm.hpp
1: #pragma once
3: #include <petsc/private/matdensecupmimpl.h>
4: #include <../src/mat/impls/dense/seq/dense.h>
6: #include <petsc/private/deviceimpl.h>
7: #include <petsc/private/randomimpl.h>
8: #include <petsc/private/vecimpl.h>
9: #include <petsc/private/cupmobject.hpp>
10: #include <petsc/private/cupmsolverinterface.hpp>
12: #include <petsc/private/cpp/type_traits.hpp>
13: #include <petsc/private/cpp/utility.hpp>
15: #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp>
17: namespace Petsc
18: {
20: namespace mat
21: {
23: namespace cupm
24: {
26: namespace impl
27: {
// ==========================================================================================
// MatDense_Seq_CUPM
//
// Sequential dense matrix implementation backed by CUPM (CUDA or HIP, selected via the
// DeviceType template parameter T). Derives from the shared MatDense_CUPM CRTP base.
// ==========================================================================================
template <device::cupm::DeviceType T>
class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> {
public:
  MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>);

private:
  // Device-side state attached to a MATSEQDENSECUPM matrix
  struct Mat_SeqDenseCUPM {
    PetscScalar *d_v;           // pointer to the matrix on the GPU
    PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original
    bool d_user_alloc;          // d_v was supplied by the user (do not free it ourselves)
    bool d_unplaced_user_alloc; // ownership flag stashed alongside unplacedarray
    // factorization support
    cupmBlasInt_t *d_fact_ipiv; // device pivots
    cupmScalar_t  *d_fact_tau;  // device QR tau vector
    cupmBlasInt_t *d_fact_info; // device info
    cupmScalar_t  *d_fact_work; // device workspace
    cupmBlasInt_t  d_fact_lwork; // size of device workspace
    // workspace
    Vec workvec;
  };

  // allocate the device array (or adopt a user-supplied one)
  static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept;

  // host <-> device data migration, driven by the matrix offload mask
  static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept;
  static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept;

  // inspect the device-side info flag written by cupmSolver routines (debug builds only)
  static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept;

  // factorization/solve backends, selected at compile time through the dispatchers below
  template <typename Derived>
  struct SolveCommon;
  struct SolveQR;
  struct SolveCholesky;
  struct SolveLU;

  template <typename Solver, bool transpose>
  static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept;
  template <typename Solver, bool transpose>
  static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept;
  template <bool transpose, bool hermitian>
  static PetscErrorCode MatMultAddColumnRange_Dispatch_(Mat, Vec, Vec, Vec, PetscInt, PetscInt) noexcept;
  template <bool transpose, bool hermitian>
  static PetscErrorCode MatMultColumnRange_Dispatch_(Mat, Vec, Vec, PetscInt, PetscInt) noexcept;
  template <bool transpose, bool hermitian>
  static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept;
  template <bool to_host>
  static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept;

  PETSC_NODISCARD static constexpr MatType MATIMPLCUPM_() noexcept;
  PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept;

public:
  PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept;

  // define these by hand since they don't fit the above mold
  PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept;
  PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept;

  // object lifecycle
  static PetscErrorCode Create(Mat) noexcept;
  static PetscErrorCode Destroy(Mat) noexcept;
  static PetscErrorCode SetUp(Mat) noexcept;
  static PetscErrorCode Reset(Mat) noexcept;

  static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept;
  static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept;
  static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept;

  // array access, parameterized on memory type and access mode
  template <PetscMemType, PetscMemoryAccessMode>
  static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
  template <PetscMemType, PetscMemoryAccessMode>
  static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept;

private:
  // C-callable thunks: acquire a device context, then forward to the templated versions above
  template <PetscMemType mtype, PetscMemoryAccessMode mode>
  static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(GetArray<mtype, mode>(m, p, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  template <PetscMemType mtype, PetscMemoryAccessMode mode>
  static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(RestoreArray<mtype, mode>(m, p, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  template <PetscMemoryAccessMode mode>
  static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  template <PetscMemoryAccessMode mode>
  static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

public:
  // array placement (user-provided device buffers)
  static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept;
  static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept;
  static PetscErrorCode ResetArray(Mat) noexcept;

  // matrix operations
  template <bool transpose_A, bool transpose_B>
  static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept;
  static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept;
  static PetscErrorCode ZeroEntries(Mat) noexcept;
  static PetscErrorCode Conjugate(Mat) noexcept;
  static PetscErrorCode Scale(Mat, PetscScalar) noexcept;
  static PetscErrorCode DiagonalScale(Mat, Vec, Vec) noexcept;
  static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept;
  static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept;
  static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept;

  // column access
  static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept;

  // factorization entry points
  static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept;
  static PetscErrorCode InvertFactors(Mat) noexcept;

  // contiguous submatrix views
  static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept;
  static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept;
};
179: } // namespace impl
181: namespace
182: {
// Declare this here so that the functions below can make use of it
//
// Convenience wrapper: create an m x n sequential dense CUPM matrix, optionally adopting a
// caller-supplied data array, optionally deferring preallocation to a later SetUp/assembly.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept
{
  PetscFunctionBegin;
  // local sizes equal global sizes for the sequential implementation
  PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate));
  PetscFunctionReturn(PETSC_SUCCESS);
}
193: } // anonymous namespace
195: namespace impl
196: {
198: // ==========================================================================================
199: // MatDense_Seq_CUPM - Private API - Utility
200: // ==========================================================================================
202: template <device::cupm::DeviceType T>
203: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept
204: {
205: const auto mcu = MatCUPMCast(m);
206: const auto nrows = m->rmap->n;
207: const auto ncols = m->cmap->n;
208: auto &lda = MatIMPLCast(m)->lda;
209: cupmStream_t stream;
211: PetscFunctionBegin;
212: PetscCheckTypeName(m, MATSEQDENSECUPM());
214: PetscCall(checkCupmBlasIntCast(nrows));
215: PetscCall(checkCupmBlasIntCast(ncols));
216: PetscCall(GetHandlesFrom_(dctx, &stream));
217: if (lda <= 0) lda = nrows;
218: if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
219: if (user_device_array) {
220: mcu->d_user_alloc = PETSC_TRUE;
221: mcu->d_v = user_device_array;
222: } else {
223: std::size_t size;
225: mcu->d_user_alloc = PETSC_FALSE;
226: size = lda * ncols;
227: PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream));
228: PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream));
229: }
230: m->offloadmask = PETSC_OFFLOAD_GPU;
231: PetscFunctionReturn(PETSC_SUCCESS);
232: }
// Copy host matrix data to the device, allocating device storage on demand.
//
// No-op when the matrix is bound to the CPU, or when the offload mask says the device copy
// is already current (anything other than CPU/UNALLOCATED).
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept
{
  const auto nrows = m->rmap->n;
  const auto ncols = m->cmap->n;
  // copy only when the host holds the authoritative data (or nothing is allocated yet)
  const auto copy = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED;

  PetscFunctionBegin;
  PetscCheckTypeName(m, MATSEQDENSECUPM());
  if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
  if (copy) {
    const auto   mcu = MatCUPMCast(m);
    cupmStream_t stream;

    // Allocate GPU memory if not present
    if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx, nullptr));
    PetscCall(GetHandlesFrom_(dctx, &stream));
    PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0));
    {
      const auto mimpl = MatIMPLCast(m);
      const auto lda   = mimpl->lda;
      const auto src   = mimpl->v;
      const auto dest  = mcu->d_v;

      // padded columns (lda > nrows) require a strided 2D copy; otherwise one flat copy
      if (lda > nrows) {
        PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream));
      } else {
        PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream));
      }
    }
    PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0));
    // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH
    m->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Copy device matrix data back to the host, allocating host storage on demand.
//
// Copies only when the device holds the sole authoritative copy (PETSC_OFFLOAD_GPU).
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept
{
  const auto nrows = m->rmap->n;
  const auto ncols = m->cmap->n;
  const auto copy  = m->offloadmask == PETSC_OFFLOAD_GPU;

  PetscFunctionBegin;
  PetscCheckTypeName(m, MATSEQDENSECUPM());
  PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
  if (copy) {
    const auto   mimpl = MatIMPLCast(m);
    cupmStream_t stream;

    // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
    if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
    PetscCall(GetHandlesFrom_(dctx, &stream));
    PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0));
    {
      const auto lda  = mimpl->lda;
      const auto dest = mimpl->v;
      const auto src  = MatCUPMCast(m)->d_v;

      // padded columns (lda > nrows) require a strided 2D copy; otherwise one flat copy
      if (lda > nrows) {
        PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream));
      } else {
        PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream));
      }
    }
    PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0));
    // order is important, MatSeqDenseSetPreallocation() might set offloadmask
    m->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Validate the device-side "info" flag written by a cupmSolver factorization/solve call.
//
// Debug builds only: copies the flag to the host (synchronizing the stream) and errors if
// it is nonzero. info > 0 indicates a zero pivot at that (1-based) row, info < 0 indicates
// an invalid argument passed to the solver. Release builds skip the check entirely to
// avoid the device-to-host synchronization.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept
{
  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    cupmBlasInt_t info = 0;

    PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream));
    // must synchronize before reading info on the host
    if (stream) PetscCallCUPM(cupmStreamSynchronize(stream));
    // the %d formats below assume info is a plain int
    static_assert(std::is_same<decltype(info), int>::value, "");
    PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
    PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
324: // ==========================================================================================
325: // MatDense_Seq_CUPM - Private API - Solver Dispatch
326: // ==========================================================================================
// specific solvers called through the dispatch_() family of functions
//
// CRTP base shared by SolveLU, SolveCholesky, and SolveQR. Derived classes supply NAME()
// and MATFACTORTYPE(); this base provides workspace sizing and the bookkeeping common to
// the start of every factorization.
template <device::cupm::DeviceType T>
template <typename Derived>
struct MatDense_Seq_CUPM<T>::SolveCommon {
  using derived_type = Derived;

  // Query the required workspace size via the supplied callable and grow (never shrink)
  // the cached device workspace if the current one is too small.
  template <typename F>
  static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept
  {
    cupmBlasInt_t lwork;

    PetscFunctionBegin;
    PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork));
    if (lwork > mcu->d_fact_lwork) {
      mcu->d_fact_lwork = lwork;
      PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
      PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream));
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // Common pre-factorization setup: record the factor type, install the solve callbacks
  // for the derived factorization, set the solver type name, and make sure the device-side
  // info flag is allocated.
  static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept
  {
    const auto mcu = MatCUPMCast(A);

    PetscFunctionBegin;
    PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n));
    A->factortype             = derived_type::MATFACTORTYPE();
    A->ops->solve             = MatSolve_Factored_Dispatch_<derived_type, false>;
    A->ops->solvetranspose    = MatSolve_Factored_Dispatch_<derived_type, true>;
    A->ops->matsolve          = MatMatSolve_Factored_Dispatch_<derived_type, false>;
    A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>;

    PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype));
    if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
};
// LU factorization (Xgetrf, partial pivoting) and triangular solves (Xgetrs)
template <device::cupm::DeviceType T>
struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> {
  using base_type = SolveCommon<SolveLU>;

  static constexpr const char *NAME() noexcept { return "LU"; }
  static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; }

  // In-place device LU factorization of A; the IS and MatFactorInfo arguments are unused
  static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept
  {
    const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
    const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
    cupmStream_t       stream;
    cupmSolverHandle_t handle;
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    // empty matrix: nothing to factor
    if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
    PetscCall(GetHandles_(&dctx, &handle, &stream));
    PetscCall(base_type::FactorPrepare(A, stream));
    {
      const auto mcu = MatCUPMCast(A);
      const auto da  = DeviceArrayReadWrite(dctx, A);
      const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *fact_lwork)
          {
            return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
          }
        )
      );
      // clang-format on
      // pivot array is allocated once and reused across refactorizations
      if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));

      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
    }
    PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // Solve (A or A^T) y = x in place on the device buffer x (leading dimension ldx,
  // nrhs right-hand sides); A must have been factored by Factor() above
  template <bool transpose>
  static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
  {
    const auto         mcu       = MatCUPMCast(A);
    const auto         fact_info = mcu->d_fact_info;
    const auto         fact_ipiv = mcu->d_fact_ipiv;
    cupmSolverHandle_t handle;

    PetscFunctionBegin;
    PetscCall(GetHandlesFrom_(dctx, &handle));
    PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
    PetscCall(PetscLogGpuTimeBegin());
    {
      constexpr auto op  = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N;
      const auto     da  = DeviceArrayRead(dctx, A);
      const auto     lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *lwork)
          {
            return cupmSolverXgetrs_bufferSize(
              handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork
            );
          }
        )
      );
      // clang-format on
      PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
      PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
};
// Cholesky factorization (Xpotrf, lower triangle) and solves (Xpotrs). Restricted to SPD
// matrices because the backend lacks sytrs/hetrs routines (see the #if 0 block below).
template <device::cupm::DeviceType T>
struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> {
  using base_type = SolveCommon<SolveCholesky>;

  static constexpr const char *NAME() noexcept { return "Cholesky"; }
  static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; }

  // In-place device Cholesky factorization of A; the IS and MatFactorInfo are unused
  static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
  {
    const auto         n = static_cast<cupmBlasInt_t>(A->rmap->n);
    PetscDeviceContext dctx;
    cupmSolverHandle_t handle;
    cupmStream_t       stream;

    PetscFunctionBegin;
    // empty matrix: nothing to factor
    if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
    // only SPD is supported; indefinite symmetric would need sytrs (unavailable, see below)
    PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName());
    PetscCall(GetHandles_(&dctx, &handle, &stream));
    PetscCall(base_type::FactorPrepare(A, stream));
    {
      const auto mcu = MatCUPMCast(A);
      const auto da  = DeviceArrayReadWrite(dctx, A);
      const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *fact_lwork)
          {
            return cupmSolverXpotrf_bufferSize(
              handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork
            );
          }
        )
      );
      // clang-format on
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
    }
    PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));

#if 0
    // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs
    // and *hetr* routines. The code below should work, and it can be activated when *sytrs
    // routines will be available
    if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
    if (!mcu->d_fact_lwork) {
      PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork));
      PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream));
    }
    if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // Solve A y = x in place on the device buffer x. The transpose template parameter is
  // unused in the body: the factored matrix is SPD (enforced in Factor() above).
  template <bool transpose>
  static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
  {
    const auto         mcu       = MatCUPMCast(A);
    const auto         fact_info = mcu->d_fact_info;
    cupmSolverHandle_t handle;

    PetscFunctionBegin;
    // pivots would only exist for a sytrf factorization, which is not implemented
    PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName());
    PetscCall(GetHandlesFrom_(dctx, &handle));
    PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
    PetscCall(PetscLogGpuTimeBegin());
    {
      const auto da  = DeviceArrayRead(dctx, A);
      const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *lwork)
          {
            return cupmSolverXpotrs_bufferSize(
              handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork
            );
          }
        )
      );
      // clang-format on
      PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
      PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
};
551: template <device::cupm::DeviceType T>
552: struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> {
553: using base_type = SolveCommon<SolveQR>;
555: static constexpr const char *NAME() noexcept { return "QR"; }
556: static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; }
558: static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
559: {
560: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
561: const auto n = static_cast<cupmBlasInt_t>(A->cmap->n);
562: const auto min = std::min(m, n);
563: const auto mimpl = MatIMPLCast(A);
564: cupmStream_t stream;
565: cupmSolverHandle_t handle;
566: PetscDeviceContext dctx;
568: PetscFunctionBegin;
569: if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
570: PetscCall(GetHandles_(&dctx, &handle, &stream));
571: PetscCall(base_type::FactorPrepare(A, stream));
572: mimpl->rank = min;
573: {
574: const auto mcu = MatCUPMCast(A);
575: const auto da = DeviceArrayReadWrite(dctx, A);
576: const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
578: if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec));
579: if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream));
580: // clang-format off
581: PetscCall(
582: base_type::ResizeFactLwork(
583: mcu, stream,
584: [&](cupmBlasInt_t *fact_lwork)
585: {
586: return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
587: }
588: )
589: );
590: // clang-format on
591: PetscCall(PetscLogGpuTimeBegin());
592: PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
593: PetscCall(PetscLogGpuTimeEnd());
594: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
595: }
596: PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0)));
597: PetscFunctionReturn(PETSC_SUCCESS);
598: }
600: template <bool transpose>
601: static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
602: {
603: const auto mimpl = MatIMPLCast(A);
604: const auto rank = static_cast<cupmBlasInt_t>(mimpl->rank);
605: const auto mcu = MatCUPMCast(A);
606: const auto fact_info = mcu->d_fact_info;
607: const auto fact_tau = mcu->d_fact_tau;
608: const auto fact_work = mcu->d_fact_work;
609: const auto fact_lwork = mcu->d_fact_lwork;
610: cupmSolverHandle_t solver_handle;
611: cupmBlasHandle_t blas_handle;
613: PetscFunctionBegin;
614: PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle));
615: PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
616: PetscCall(PetscLogGpuTimeBegin());
617: {
618: const auto da = DeviceArrayRead(dctx, A);
619: const auto one = cupmScalarCast(1.0);
620: const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
622: if (transpose) {
623: PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
624: PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
625: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
626: } else {
627: constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T;
629: PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
630: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
631: PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
632: }
633: }
634: PetscCall(PetscLogGpuTimeEnd());
635: PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank))));
636: PetscFunctionReturn(PETSC_SUCCESS);
637: }
638: };
// MatSolve callback installed by FactorPrepare(): solve (A or A^T) y = x for a single
// right-hand side using the Solver's factored data, staging x into a device buffer
// (y's own array when it is large enough, otherwise a cached work vector) and copying
// the result back to wherever y lives.
template <device::cupm::DeviceType T>
template <typename Solver, bool transpose>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept
{
  using namespace vec::cupm;
  const auto         pobj_A = PetscObjectCast(A);
  const auto         m      = static_cast<cupmBlasInt_t>(A->rmap->n);
  const auto         k      = static_cast<cupmBlasInt_t>(A->cmap->n);
  auto              &workvec = MatCUPMCast(A)->workvec;
  PetscScalar       *y_array = nullptr;
  PetscDeviceContext dctx;
  PetscBool          xiscupm, yiscupm, aiscupm;
  bool               use_y_array_directly;
  cupmStream_t       stream;

  PetscFunctionBegin;
  PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm));
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm));
  PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm));
  // debug-only sanity check; the !aiscupm conversion paths below still cover optimized builds
  PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????");
  PetscCall(GetHandles_(&dctx, &stream));
  // y's array can hold the solve buffer only if it is on the device and at least m long
  use_y_array_directly = yiscupm && (k >= m);
  {
    const PetscScalar *x_array;
    const auto         xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask);
    const auto         copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;

    if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec));
    // The logic here is to try to minimize the amount of memory copying:
    //
    // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded
    // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the
    // data in order to copy it into the y array. So the array x will be wherever the data
    // already is so that only one memcpy is performed
    if (xisdevice) {
      PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx));
    } else {
      PetscCall(VecGetArrayRead(x, &x_array));
    }
    PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx));
    PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream));
    if (xisdevice) {
      PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx));
    } else {
      PetscCall(VecRestoreArrayRead(x, &x_array));
    }
  }

  // temporarily convert A to device form for the solve if it somehow is not already
  if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
  // solve in place on y_array (ld = m, one right-hand side)
  PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream));
  if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));

  if (use_y_array_directly) {
    PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx));
  } else {
    const auto   copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
    PetscScalar *yv;

    // The logic here is that the data is not yet in either y's GPU array or its CPU array.
    // There is nothing in the interface to say where the user would like it to end up. So we
    // choose the GPU, because it is the faster option
    if (yiscupm) {
      PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
    } else {
      PetscCall(VecGetArray(y, &yv));
    }
    PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
    if (yiscupm) {
      PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
    } else {
      PetscCall(VecRestoreArray(y, &yv));
    }
    PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
718: template <device::cupm::DeviceType T>
719: template <typename Solver, bool transpose>
720: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
721: {
722: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
723: const auto k = static_cast<cupmBlasInt_t>(A->cmap->n);
724: cupmBlasInt_t nrhs, ldb, ldx, ldy;
725: PetscScalar *y;
726: PetscBool biscupm, xiscupm, aiscupm;
727: PetscDeviceContext dctx;
728: cupmStream_t stream;
730: PetscFunctionBegin;
731: PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
732: PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
733: PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
734: PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
735: PetscCall(GetHandles_(&dctx, &stream));
736: {
737: PetscInt n;
739: PetscCall(MatGetSize(B, nullptr, &n));
740: PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
741: PetscCall(MatDenseGetLDA(B, &n));
742: PetscCall(PetscCUPMBlasIntCast(n, &ldb));
743: PetscCall(MatDenseGetLDA(X, &n));
744: PetscCall(PetscCUPMBlasIntCast(n, &ldx));
745: }
746: {
747: // The logic here is to try to minimize the amount of memory copying:
748: //
749: // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
750: // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to
751: // get the data in order to copy it into the y array. So the array b will be wherever the
752: // data already is so that only one memcpy is performed
753: const auto bisdevice = biscupm && PetscOffloadDevice(B->offloadmask);
754: const auto copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
755: const PetscScalar *b;
757: if (bisdevice) {
758: b = DeviceArrayRead(dctx, B);
759: } else if (biscupm) {
760: b = HostArrayRead(dctx, B);
761: } else {
762: PetscCall(MatDenseGetArrayRead(B, &b));
763: }
765: if (ldx < m || !xiscupm) {
766: // X's array cannot serve as the array (too small or not on device), B's array cannot
767: // serve as the array (const), so allocate a new array
768: ldy = m;
769: PetscCall(PetscCUPMMallocAsync(&y, nrhs * m));
770: } else {
771: // X's array should serve as the array
772: ldy = ldx;
773: y = DeviceArrayWrite(dctx, X);
774: }
775: PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream));
776: if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b));
777: }
779: // convert to CUPM twice??????????????????????????????????
780: // but A should already be CUPM??????????????????????????????????????
781: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
782: PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream));
783: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
785: if (ldx < m || !xiscupm) {
786: const auto copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
787: PetscScalar *x;
789: // The logic here is that the data is not yet in either X's GPU array or its CPU
790: // array. There is nothing in the interface to say where the user would like it to end up.
791: // So we choose the GPU, because it is the faster option
792: if (xiscupm) {
793: x = DeviceArrayWrite(dctx, X);
794: } else {
795: PetscCall(MatDenseGetArray(X, &x));
796: }
797: PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream));
798: if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x));
799: PetscCallCUPM(cupmFreeAsync(y, stream));
800: }
801: PetscFunctionReturn(PETSC_SUCCESS);
802: }
804: template <device::cupm::DeviceType T>
805: template <bool transpose, bool hermitian>
806: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAddColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz, PetscInt c_start, PetscInt c_end) noexcept
807: {
808: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
809: const auto n = static_cast<cupmBlasInt_t>(c_end - c_start);
810: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
811: PetscBool xiscupm, yiscupm, ziscupm;
812: cupmBlasHandle_t handle;
813: Vec x = xx, y = yy, z = zz;
814: PetscDeviceContext dctx;
816: PetscFunctionBegin;
817: PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(xx), &xiscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
818: if (!xiscupm || xx->boundtocpu) {
819: PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(xx)), &x));
820: PetscCall(VecSetLayout(x, xx->map));
821: PetscCall(VecSetType(x, VecSeq_CUPM::VECCUPM()));
822: PetscCall(VecCopy(xx, x));
823: }
825: if (yy) {
826: PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(yy), &yiscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
827: if (!yiscupm || yy->boundtocpu) {
828: PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(yy)), &y));
829: PetscCall(VecSetLayout(y, yy->map));
830: PetscCall(VecSetType(y, VecSeq_CUPM::VECCUPM()));
831: PetscCall(VecCopy(yy, y));
832: }
833: }
835: if (zz != yy) {
836: PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(zz), &ziscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
837: if (!ziscupm || zz->boundtocpu) {
838: PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(zz)), &z));
839: PetscCall(VecSetLayout(z, zz->map));
840: PetscCall(VecSetType(z, VecSeq_CUPM::VECCUPM()));
841: }
842: } else {
843: z = y;
844: }
846: if (y && y != z) PetscCall(VecSeq_CUPM::Copy(y, z)); // mult add
847: if (!m || !n) {
848: // mult only
849: if (!y) PetscCall(VecSeq_CUPM::Set(z, 0.0));
850: PetscFunctionReturn(PETSC_SUCCESS);
851: }
852: PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n));
853: PetscCall(GetHandles_(&dctx, &handle));
854: {
855: constexpr auto op = transpose ? (hermitian ? CUPMBLAS_OP_C : CUPMBLAS_OP_T) : CUPMBLAS_OP_N;
856: const auto one = cupmScalarCast(1.0);
857: const auto zero = cupmScalarCast(0.0);
858: const auto da = DeviceArrayRead(dctx, A);
859: const auto dxx = VecSeq_CUPM::DeviceArrayRead(dctx, x);
860: const auto dzz = VecSeq_CUPM::DeviceArrayReadWrite(dctx, z);
862: PetscCall(PetscLogGpuTimeBegin());
863: PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata() + c_start * lda, lda, dxx.cupmdata() + (transpose ? 0 : c_start), 1, y ? &one : &zero, dzz.cupmdata() + (transpose ? c_start : 0), 1));
864: PetscCall(PetscLogGpuTimeEnd());
865: }
866: PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m)));
867: if (z != zz) {
868: PetscCall(VecCopy(z, zz));
869: if (z != y) PetscCall(VecDestroy(&z));
870: }
871: if (y != yy) PetscCall(VecDestroy(&y));
872: if (x != xx) PetscCall(VecDestroy(&x));
873: PetscFunctionReturn(PETSC_SUCCESS);
874: }
876: template <device::cupm::DeviceType T>
877: template <bool transpose, bool hermitian>
878: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end) noexcept
879: {
880: PetscFunctionBegin;
881: PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, nullptr, yy, c_start, c_end));
882: PetscFunctionReturn(PETSC_SUCCESS);
883: }
885: template <device::cupm::DeviceType T>
886: template <bool transpose, bool hermitian>
887: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept
888: {
889: PetscFunctionBegin;
890: PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, yy, zz, 0, A->cmap->n));
891: PetscFunctionReturn(PETSC_SUCCESS);
892: }
894: // ==========================================================================================
895: // MatDense_Seq_CUPM - Private API - Conversion Dispatch
896: // ==========================================================================================
// Shared body of MatConvert() between MATSEQDENSE and MATSEQDENSECUPM; to_host selects the
// direction. MAT_INPLACE_MATRIX morphs the object in place by swapping its type name,
// default vec type, composed functions, and ops table; the other reuse modes fall back to a
// generic copy-based conversion.
template <device::cupm::DeviceType T>
template <bool to_host>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
{
  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
    // TODO these cases should be optimized
    PetscCall(MatConvert_Basic(M, type, reuse, newmat));
  } else {
    const auto B    = *newmat;
    const auto pobj = PetscObjectCast(B);

    if (to_host) {
      // sync data back to the CPU and tear down the device-side state before retyping
      PetscCall(BindToCPU(B, PETSC_TRUE));
      PetscCall(Reset(B));
    } else {
      PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
    }

    PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
    PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
    // cvec might be the wrong VecType, destroy and rebuild it if necessary
    // REVIEW ME: this is possibly very inefficient
    PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));

    // compose the device-only query functions (removed again when converting to host)
    MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
    MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMSetPreallocation_C(), nullptr, SetPreallocation);

    if (to_host) {
      B->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      Mat_SeqDenseCUPM *mcu;

      // allocate the (zeroed) device-side bookkeeping struct; device memory itself is
      // allocated lazily on first device access
      PetscCall(PetscNew(&mcu));
      B->spptr       = mcu;
      B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
      PetscCall(BindToCPU(B, PETSC_FALSE));
    }

    MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
    MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
953: // ==========================================================================================
954: // MatDense_Seq_CUPM - Public API
955: // ==========================================================================================
// the MatType string this class implements (the sequential dense CUPM type)
template <device::cupm::DeviceType T>
inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
{
  return MATSEQDENSECUPM();
}
// retrieve the device-side implementation struct, which is stashed in the generic spptr slot
template <device::cupm::DeviceType T>
inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
{
  return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
}
// retrieve the host-side Mat_SeqDense implementation struct from the generic data slot
template <device::cupm::DeviceType T>
inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
{
  return static_cast<Mat_SeqDense *>(m->data);
}
// name under which the device->host MatConvert function is composed, per backend
template <device::cupm::DeviceType T>
inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
{
  return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
}
// name under which the seqaij-times-seqdense product setup function is composed, per backend
template <device::cupm::DeviceType T>
inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
{
  return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C";
}
987: // ==========================================================================================
989: // MatCreate_SeqDenseCUPM()
990: template <device::cupm::DeviceType T>
991: inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept
992: {
993: PetscFunctionBegin;
994: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
995: PetscCall(MatCreate_SeqDense(A));
996: PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
997: PetscFunctionReturn(PETSC_SUCCESS);
998: }
// Destroy the CUPM matrix: demote it to plain MATSEQDENSE (which releases the device-side
// state) and then run the host destructor.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept
{
  // prevent copying back data if we own the data pointer
  // (marking the host copy authoritative makes the conversion skip the device-to-host sync)
  if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
  PetscCall(MatDestroy_SeqDense(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1011: // obj->ops->setup()
// obj->ops->setup()
//
// Finalize the row/column layouts and lazily preallocate (device) storage if the user has
// not done so already.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept
{
  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  if (!A->preallocated) {
    PetscDeviceContext dctx;

    // nullptr array => allocate our own device buffer
    PetscCall(GetHandles_(&dctx));
    PetscCall(SetPreallocation(A, dctx, nullptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Release all device-side resources (matrix storage, factorization workspaces, work vector)
// and free the bookkeeping struct itself. No-op when no device state was ever allocated.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept
{
  PetscFunctionBegin;
  if (const auto mcu = MatCUPMCast(A)) {
    cupmStream_t stream;

    // a placed (user) array must be restored before teardown, else d_v would be the user's
    PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
    PetscCall(GetHandles_(&stream));
    // only free the matrix buffer if we allocated it; factorization buffers are always ours
    // (assumes cupmFreeAsync() tolerates nullptr, as cudaFree() does -- these may never have
    // been allocated)
    if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
    PetscCall(VecDestroy(&mcu->workvec));
    PetscCall(PetscFree(A->spptr /* mcu */));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1047: // ==========================================================================================
// Toggle between host (to_host = true) and device execution for A by rewiring every composed
// function and ops-table entry to the corresponding host or device implementation. Binding
// to the CPU also synchronizes the data back to the host first.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto pobj  = PetscObjectCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  A->boundtocpu = to_host;
  // random numbers should be generated where the data lives
  PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
  if (to_host) {
    PetscDeviceContext dctx;

    // make sure we have an up-to-date copy on the CPU
    PetscCall(GetHandles_(&dctx));
    PetscCall(DeviceToHost_(A, dctx));
  } else {
    PetscBool iscupm;

    // cached column vec/submat may have been built with host types; drop them so they get
    // rebuilt with device types on next use
    if (auto &cvec = mimpl->cvec) {
      PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
      if (!iscupm) PetscCall(VecDestroy(&cvec));
    }
    if (auto &cmat = mimpl->cmat) {
      PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
      if (!iscupm) PetscCall(MatDestroy(&cmat));
    }
  }

  // ============================================================
  // Composed ops
  // ============================================================
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
  MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
  MatComposeOp_CUPM(to_host, pobj, "MatMultColumnRange_C", MatMultColumnRange_SeqDense, MatMultColumnRange_Dispatch_</* transpose */ false, /* hermitian */ false>);
  MatComposeOp_CUPM(to_host, pobj, "MatMultAddColumnRange_C", MatMultAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ false, /* hermitian */ false>);
  MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeColumnRange_C", MatMultHermitianTransposeColumnRange_SeqDense, MatMultColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
  MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeAddColumnRange_C", MatMultHermitianTransposeAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
  // always the same
  PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));

  // ============================================================
  // Function pointer ops
  // ============================================================
  MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
  MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>(A, xx, nullptr, yy); });
  MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>(A, xx, nullptr, yy); });
  // NOTE(review): host fallback for multhermitiantranspose is MatMultTranspose_SeqDense,
  // which differs from a hermitian transpose for complex scalars -- confirm intentional
  MatSetOp_CUPM(to_host, A, multhermitiantranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>(A, xx, nullptr, yy); });
  MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>);
  MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>);
  MatSetOp_CUPM(to_host, A, multhermitiantransposeadd, MatMultHermitianTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>);
  MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
  MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
  MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
  MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
  MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
  MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
  MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
  MatSetOp_CUPM(to_host, A, conjugate, MatConjugate_SeqDense, Conjugate);
  MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
  MatSetOp_CUPM(to_host, A, diagonalscale, MatDiagonalScale_SeqDense, DiagonalScale);
  MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
  MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
  MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
  MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
  MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
  MatSetOp_CUPM(to_host, A, getdiagonal, MatGetDiagonal_SeqDense, GetDiagonal);
  // seemingly always the same
  A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;

  // propagate the binding to the cached submatrix, if any
  if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
  PetscFunctionReturn(PETSC_SUCCESS);
}
// device -> host direction of the shared in-place conversion dispatcher
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
{
  PetscFunctionBegin;
  PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
  PetscFunctionReturn(PETSC_SUCCESS);
}
// host -> device direction of the shared in-place conversion dispatcher
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
{
  PetscFunctionBegin;
  PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1156: // ==========================================================================================
// Return the raw host or device array (per mtype), synchronizing or allocating as required
// by the requested access mode: read access syncs data to the requested side first; pure
// write access skips the sync but ensures storage exists. Write access also flips the
// offload mask and bumps the object state so stale copies are detected.
template <device::cupm::DeviceType T>
template <PetscMemType mtype, PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
{
  constexpr auto hostmem     = PetscMemTypeHost(mtype);
  constexpr auto read_access = PetscMemoryAccessRead(access);

  PetscFunctionBegin;
  static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
  PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
  if (hostmem) {
    if (read_access) {
      PetscCall(DeviceToHost_(m, dctx));
    } else if (!MatIMPLCast(m)->v) {
      // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
      PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
    }
    *array = MatIMPLCast(m)->v;
  } else {
    if (read_access) {
      PetscCall(HostToDevice_(m, dctx));
    } else if (!MatCUPMCast(m)->d_v) {
      // write-only
      PetscCall(SetPreallocation(m, dctx, nullptr));
    }
    *array = MatCUPMCast(m)->d_v;
  }
  if (PetscMemoryAccessWrite(access)) {
    // the side handed out is about to be mutated, so it becomes the authoritative copy
    m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
    PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Return an array obtained from GetArray(). For write access the restored side is marked
// authoritative and the object state bumped; the caller's pointer is nulled (after checking
// it matches the expected memory type) so it cannot be reused.
template <device::cupm::DeviceType T>
template <PetscMemType mtype, PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
{
  PetscFunctionBegin;
  static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
  if (PetscMemoryAccessWrite(access)) {
    // WRITE or READ_WRITE
    m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
    PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
  }
  if (array) {
    PetscCall(CheckPointerMatchesMemType_(*array, mtype));
    *array = nullptr;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Device-array access that also reports the memory type of the returned pointer
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
{
  PetscFunctionBegin;
  PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
  if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Counterpart of GetArrayAndMemType(); simply restores the device array
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
{
  PetscFunctionBegin;
  PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1229: // ==========================================================================================
// Temporarily substitute the matrix's device storage with a user-supplied device array; the
// current buffer (and its ownership flag) is stashed until ResetArray() restores it. The
// substituted array is treated as user-owned (never freed by us).
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
  if (mimpl->v) {
    PetscDeviceContext dctx;

    // sync host -> device first so the buffer being stashed holds the latest data
    PetscCall(GetHandles_(&dctx));
    PetscCall(HostToDevice_(A, dctx));
  }
  mcu->unplacedarray         = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
  mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Permanently replace the matrix's device storage with the given array, freeing our current
// buffer if we own it. Ownership of the new array transfers to the matrix (d_user_alloc is
// cleared), so it will be freed on Reset()/destroy.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
  if (!mcu->d_user_alloc) {
    cupmStream_t stream;

    PetscCall(GetHandles_(&stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
  }
  mcu->d_v          = const_cast<PetscScalar *>(array);
  mcu->d_user_alloc = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Undo a prior PlaceArray(): swap the stashed original buffer (and its ownership flag) back
// in. The host copy, if newer, is flushed into the placed array first so the user gets
// up-to-date values back.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  if (mimpl->v) {
    PetscDeviceContext dctx;

    PetscCall(GetHandles_(&dctx));
    PetscCall(HostToDevice_(A, dctx));
  }
  mcu->d_v          = util::exchange(mcu->unplacedarray, nullptr);
  mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1293: // ==========================================================================================
// Numeric phase of C = op(A) * op(B) on the device via gemm. Non-CUPM dense operands are
// converted in place to the CUPM type for the call and converted back afterwards.
template <device::cupm::DeviceType T>
template <bool transpose_A, bool transpose_B>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
{
  cupmBlasInt_t      m, n, k;
  PetscBool          Aiscupm, Biscupm;
  PetscDeviceContext dctx;
  cupmBlasHandle_t   handle;

  PetscFunctionBegin;
  PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
  PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
  PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
  if (!m || !n || !k) {
    // degenerate product: the result is identically zero
    PetscCall(ZeroEntries(C));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // we may end up with SEQDENSE as one of the arguments
  // REVIEW ME: how? and why is it not B and C????????
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm));
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
  if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
  if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B));
  PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n));
  PetscCall(GetHandles_(&dctx, &handle));

  PetscCall(PetscLogGpuTimeBegin());
  {
    const auto one  = cupmScalarCast(1.0);
    const auto zero = cupmScalarCast(0.0);
    const auto da   = DeviceArrayRead(dctx, A);
    const auto db   = DeviceArrayRead(dctx, B);
    const auto dc   = DeviceArrayWrite(dctx, C);
    PetscInt   alda, blda, clda;

    PetscCall(MatDenseGetLDA(A, &alda));
    PetscCall(MatDenseGetLDA(B, &blda));
    PetscCall(MatDenseGetLDA(C, &clda));
    // C = 1.0 * op(A) * op(B) + 0.0 * C
    PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda));
  }
  PetscCall(PetscLogGpuTimeEnd());

  // m*n*k multiplies plus m*n*(k-1) additions
  PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
  // restore any operand we converted above to its original type
  if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
  if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
// B = A. When both matrices use this implementation the copy is done entirely on the
// device (2-D strided copy if either has a padded leading dimension, contiguous copy
// otherwise); mismatched implementations fall back to the generic entry-wise copy.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept
{
  const auto m = A->rmap->n;
  const auto n = A->cmap->n;

  PetscFunctionBegin;
  PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
  // The two matrices must have the same copy implementation to be eligible for fast copy
  if (A->ops->copy == B->ops->copy) {
    PetscDeviceContext dctx;
    cupmStream_t       stream;

    PetscCall(GetHandles_(&dctx, &stream));
    PetscCall(PetscLogGpuTimeBegin());
    {
      const auto va = DeviceArrayRead(dctx, A);
      const auto vb = DeviceArrayWrite(dctx, B);
      // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets
      // lda!
      const auto lda_a = MatIMPLCast(A)->lda;
      const auto lda_b = MatIMPLCast(B)->lda;

      if (lda_a > m || lda_b > m) {
        PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME());
        PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME());
        // padded: copy column-by-column honoring both leading dimensions
        PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream));
      } else {
        PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream));
      }
    }
    PetscCall(PetscLogGpuTimeEnd());
  } else {
    PetscCall(MatCopy_Basic(A, B, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1382: template <device::cupm::DeviceType T>
1383: inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept
1384: {
1385: PetscDeviceContext dctx;
1386: cupmStream_t stream;
1388: PetscFunctionBegin;
1389: PetscCall(GetHandles_(&dctx, &stream));
1390: PetscCall(PetscLogGpuTimeBegin());
1391: {
1392: const auto va = DeviceArrayWrite(dctx, m);
1393: const auto lda = MatIMPLCast(m)->lda;
1394: const auto ma = m->rmap->n;
1395: const auto na = m->cmap->n;
1397: if (lda > ma) {
1398: PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream));
1399: } else {
1400: PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream));
1401: }
1402: }
1403: PetscCall(PetscLogGpuTimeEnd());
1404: PetscFunctionReturn(PETSC_SUCCESS);
1405: }
1407: namespace detail
1408: {
1410: // ==========================================================================================
1411: // SubMatIndexFunctor
1412: //
1413: // Iterator which permutes a linear index range into matrix indices for an nrows x ncols
1414: // submat with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for
1415: // the i'th sequential entry in the matrix.
1416: // ==========================================================================================
1417: template <typename T>
1418: struct SubMatIndexFunctor {
1419: PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); }
1421: PetscInt nrows;
1422: PetscInt ncols;
1423: PetscInt lda;
1424: };
// Iterator range over an nrows x ncols submatrix: wraps a permutation iterator (via
// MatrixIteratorBase) whose index functor skips the lda padding between columns.
template <typename Iterator>
struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<iter_difference_t<Iterator>>> {
  using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<iter_difference_t<Iterator>>>;

  using iterator = typename base_type::iterator;

  // first/last delimit the raw storage span, {nrows, ncols, lda} parameterize the
  // index permutation applied on top of it
  constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept :
    base_type{
      std::move(first), std::move(last), {nrows, ncols, lda}
    }
  {
  }

  // the range covers exactly nrows*ncols logical entries, regardless of lda padding
  PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); }
};
1442: namespace
1443: {
// Build a SubMatrixIterator over the [rstart, rend) x [cstart, cend) submatrix of a
// column-major device buffer ptr with leading dimension lda.
template <typename T>
PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept
{
  const auto nrows = rend - rstart;
  const auto ncols = cend - cstart;
  // wrap the raw device pointer so thrust treats it as a device iterator
  const auto dptr = thrust::device_pointer_cast(ptr);

  // NOTE: the raw span handed to the iterator starts at the submatrix origin and spans
  // nrows whole columns of the parent matrix; the index functor remaps into it
  return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda};
}
1455: } // namespace
1457: struct conjugate {
1458: PETSC_NODISCARD PETSC_HOSTDEVICE_INLINE_DECL PetscScalar operator()(const PetscScalar &x) const noexcept { return PetscConj(x); }
1459: };
1461: } // namespace detail
1463: template <device::cupm::DeviceType T>
1464: inline PetscErrorCode MatDense_Seq_CUPM<T>::Conjugate(Mat A) noexcept
1465: {
1466: const auto m = A->rmap->n;
1467: const auto n = A->cmap->n;
1468: const auto N = m * n;
1469: PetscDeviceContext dctx;
1470: cupmStream_t stream;
1472: PetscFunctionBegin;
1473: if (PetscDefined(USE_COMPLEX)) {
1474: PetscCall(GetHandles_(&dctx, &stream));
1475: PetscCall(PetscLogGpuTimeBegin());
1476: {
1477: const auto da = DeviceArrayReadWrite(dctx, A);
1478: const auto lda = MatIMPLCast(A)->lda;
1479: cupmStream_t stream;
1480: PetscCall(GetHandlesFrom_(dctx, &stream));
1482: if (lda > m) {
1483: // clang-format off
1484: PetscCallThrust(
1485: const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());
1487: THRUST_CALL(
1488: thrust::transform,
1489: stream,
1490: sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
1491: detail::conjugate{}
1492: )
1493: );
1494: // clang-format on
1495: } else {
1496: // clang-format off
1497: PetscCallThrust(
1498: const auto aptr = thrust::device_pointer_cast(da.data());
1500: THRUST_CALL(
1501: thrust::transform,
1502: stream,
1503: aptr, aptr + N, aptr,
1504: detail::conjugate{}
1505: )
1506: );
1507: // clang-format on
1508: }
1509: }
1510: PetscCall(PetscLogGpuTimeEnd());
1511: }
1512: PetscFunctionReturn(PETSC_SUCCESS);
1513: }
// Scale every entry of A by alpha on the device: A <- alpha * A.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept
{
  const auto         m = A->rmap->n;
  const auto         n = A->cmap->n;
  const auto         N = m * n;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
  PetscCall(GetHandles_(&dctx));
  {
    // order is important, DeviceArrayReadWrite() might call SetPreallocation() which sets lda!
    const auto da  = DeviceArrayReadWrite(dctx, A);
    const auto lda = MatIMPLCast(A)->lda;

    if (lda > m) {
      // padded columns: BLAS scal would also scale the padding, so use thrust over a
      // submatrix iterator that visits only the logical entries
      cupmStream_t stream;

      PetscCall(GetHandlesFrom_(dctx, &stream));
      // clang-format off
      PetscCallThrust(
        const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());

        THRUST_CALL(
          thrust::transform,
          stream,
          sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
          device::cupm::functors::make_times_equals(alpha)
        )
      );
      // clang-format on
    } else {
      // contiguous storage: a single flat BLAS scal over all N entries
      const auto       cu_alpha = cupmScalarCast(alpha);
      cupmBlasHandle_t handle;

      PetscCall(GetHandlesFrom_(dctx, &handle));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  // one multiply per entry
  PetscCall(PetscLogGpuFlops(N));
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Scale A on the device as A <- diag(l) * A * diag(r); either vector may be NULL.
// Non-device (or CPU-bound) scaling vectors are first copied into a temporary device
// vector. The scaling itself is performed in place via the BLAS dgmm routine.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::DiagonalScale(Mat A, Vec l, Vec r) noexcept
{
  PetscBool          iscupm;
  PetscDeviceContext dctx;
  cupmBlasHandle_t   handle;
  auto               m = A->rmap->n, n = A->cmap->n;

  PetscFunctionBegin;
  if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscInfo(A, "Performing DiagonalScale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
  PetscCall(GetHandles_(&dctx, &handle));
  {
    Vec        lr; // scratch handle: either the original vec or a temporary device copy
    const auto da  = DeviceArrayReadWrite(dctx, A);
    const auto lda = MatIMPLCast(A)->lda;

    if (l) {
      // left scaling: rows of A are scaled by the entries of l
      PetscCall(VecGetSize(l, &m)); // m re-checked below, so this cannot change its value
      PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling Vec of wrong size");
      PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(l), &iscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
      if (!iscupm || l->boundtocpu) {
        // l is not usable on the device; copy it into a temporary device vector
        PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(l)), &lr));
        PetscCall(VecSetLayout(lr, l->map));
        PetscCall(VecSetType(lr, VecSeq_CUPM::VECCUPM()));
        PetscCall(VecCopy(l, lr));
      } else lr = l;
      {
        const auto     dlr  = VecSeq_CUPM::DeviceArrayRead(dctx, lr);
        constexpr auto side = CUPMBLAS_SIDE_LEFT;

        // in-place dgmm: A <- diag(lr) * A (output and input both da with lda)
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallCUPMBLAS(cupmBlasXdgmm(handle, side, m, n, da.cupmdata(), lda, dlr.cupmdata(), 1, da.cupmdata(), lda));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCall(PetscLogGpuFlops(1.0 * n * m));
      }
      // destroy the temporary only if we created one above
      if (!iscupm || l->boundtocpu) PetscCall(VecDestroy(&lr));
    }
    if (r) {
      // right scaling: columns of A are scaled by the entries of r
      PetscCall(VecGetSize(r, &n)); // n re-checked below, so this cannot change its value
      PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling Vec of wrong size");
      PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(r), &iscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
      if (!iscupm || r->boundtocpu) {
        // r is not usable on the device; copy it into a temporary device vector
        PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(r)), &lr));
        PetscCall(VecSetLayout(lr, r->map));
        PetscCall(VecSetType(lr, VecSeq_CUPM::VECCUPM()));
        PetscCall(VecCopy(r, lr));
      } else lr = r;
      {
        const auto     dlr  = VecSeq_CUPM::DeviceArrayRead(dctx, lr);
        constexpr auto side = CUPMBLAS_SIDE_RIGHT;

        // in-place dgmm: A <- A * diag(lr)
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallCUPMBLAS(cupmBlasXdgmm(handle, side, m, n, da.cupmdata(), lda, dlr.cupmdata(), 1, da.cupmdata(), lda));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCall(PetscLogGpuFlops(1.0 * n * m));
      }
      if (!iscupm || r->boundtocpu) PetscCall(VecDestroy(&lr));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Compute Y <- alpha * X + Y on the device. The MatStructure argument is ignored
// (dense matrices always have the same "nonzero" pattern).
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
{
  const auto         m_x = X->rmap->n, m_y = Y->rmap->n;
  const auto         n_x = X->cmap->n, n_y = Y->cmap->n;
  const auto         N = m_x * n_x;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  // quick return: empty matrix or zero scaling factor leaves Y unchanged
  if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
  PetscCall(GetHandles_(&dctx));
  {
    // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets lda!
    const auto dx    = DeviceArrayRead(dctx, X);
    const auto dy    = DeviceArrayReadWrite(dctx, Y);
    const auto lda_x = MatIMPLCast(X)->lda;
    const auto lda_y = MatIMPLCast(Y)->lda;

    // NOTE(review): lda_y is compared against m_x, not m_y — this presumably relies on
    // the caller (MatAXPY) having checked that X and Y have identical dimensions
    if (lda_x > m_x || lda_y > m_x) {
      // at least one operand has padded columns: use thrust with submatrix iterators so
      // only logical entries are combined
      cupmStream_t stream;

      PetscCall(GetHandlesFrom_(dctx, &stream));
      // clang-format off
      PetscCallThrust(
        const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data());
        const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data());

        THRUST_CALL(
          thrust::transform,
          stream,
          sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(),
          device::cupm::functors::make_axpy(alpha)
        );
      );
      // clang-format on
    } else {
      // both operands contiguous: one flat BLAS axpy over all N entries
      const auto       cu_alpha = cupmScalarCast(alpha);
      cupmBlasHandle_t handle;

      PetscCall(GetHandlesFrom_(dctx, &handle));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1));
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  // one multiply and one add per entry
  PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Duplicate A into a new device-dense matrix B, optionally copying values.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
{
  // if values must be copied but the up-to-date data is not on the host, duplicate the
  // host side without values and do a device-side Copy() below instead
  const auto         hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(GetHandles_(&dctx));
  // defer preallocation: Copy() below allocates on demand, and the explicit
  // SetPreallocation() at the end covers the remaining case
  PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
  PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
  // values were requested but skipped on the host side above; copy them on the device
  if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
  // allocate memory if needed
  if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx, nullptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Fill A with random values, generating directly on the device when the random
// generator is a device type, otherwise falling back to the host implementation.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
{
  PetscBool device_rand_is_rander48;
  PetscBool device = PETSC_FALSE;

  PetscFunctionBegin;
  // CUPMObject::PETSCDEVICERAND() is PETSCRANDER48 until PetscRandom is implemented for hiprand
  PetscCall(PetscStrncmp(PETSCDEVICERAND(), PETSCRANDER48, sizeof(PETSCRANDER48), &device_rand_is_rander48));
  // rander48 is a host generator, so only check for a device generator when the device
  // random type is something else
  if (!device_rand_is_rander48) PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
  if (device) {
    const auto         m = A->rmap->n;
    const auto         n = A->cmap->n;
    PetscDeviceContext dctx;

    PetscCall(GetHandles_(&dctx));
    {
      const auto a = DeviceArrayWrite(dctx, A);
      PetscInt   lda;

      PetscCall(MatDenseGetLDA(A, &lda));
      if (lda > m) {
        // padded columns: fill column-by-column so the padding is skipped
        for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
      } else {
        // contiguous storage: fill all m*n entries in one call (with overflow check)
        PetscInt mn;

        PetscCall(PetscIntMultError(m, n, &mn));
        PetscCall(PetscRandomGetValues(rng, mn, a));
      }
    }
  } else {
    // host generator: delegate to the CPU implementation (data is synced lazily later)
    PetscCall(MatSetRandom_SeqDense(A, rng));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1725: // ==========================================================================================
// Copy column col of A into vector v, choosing the memcpy direction based on where A's
// up-to-date data lives and whether v can accept device data.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
{
  const auto         offloadmask = A->offloadmask;
  const auto         n           = A->rmap->n;
  // advance a base pointer to the start of the requested column (column-major, lda stride)
  const auto         col_offset  = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
  PetscBool          viscupm;
  PetscDeviceContext dctx;
  cupmStream_t       stream;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
  PetscCall(GetHandles_(&dctx, &stream));
  if (viscupm && !v->boundtocpu) {
    // v lives on the device: write its device array directly
    const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);

    // update device data
    if (PetscOffloadDevice(offloadmask)) {
      PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
    } else {
      PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
    }
  } else {
    // v is a host (or CPU-bound) vector: fill its host array
    PetscScalar *x;

    // update host data
    PetscCall(VecGetArrayWrite(v, &x));
    if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
      PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
    } else if (PetscOffloadDevice(offloadmask)) {
      PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
    }
    PetscCall(VecRestoreArrayWrite(v, &x));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Expose column col of A as a Vec (no copy): places A's device pointer, offset to the
// column, into the cached internal column vector. Must be paired with RestoreColumnVec().
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
{
  using namespace vec::cupm;
  const auto         mimpl = MatIMPLCast(A);
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  // stored off-by-one so that column 0 still reads as "in use"
  mimpl->vecinuse = col + 1;
  // lazily create the cached column vector on first use
  if (!mimpl->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &mimpl->cvec));
  PetscCall(GetHandles_(&dctx));
  // grab the device array with the requested access mode; pointer is stashed in
  // ptrinuse so RestoreColumnVec() can return it with the matching mode
  PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
  // size_t casts guard against PetscInt overflow in the col * lda offset computation
  PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
  if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
  *v = mimpl->cvec;
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Undo GetColumnVec(): reset the cached column vector's placed array and return the
// device pointer with the same access mode it was acquired with.
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
{
  using namespace vec::cupm;
  const auto         mimpl = MatIMPLCast(A);
  const auto         cvec  = mimpl->cvec;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
  PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
  mimpl->vecinuse = 0;
  // pop the read lock pushed by GetColumnVec() when read-only access was requested
  if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
  PetscCall(VecCUPMResetArrayAsync<T>(cvec));
  PetscCall(GetHandles_(&dctx));
  // release the device array acquired in GetColumnVec() (pointer stashed in ptrinuse)
  PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
  if (v) *v = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1806: // ==========================================================================================
// Create an (unallocated) factor matrix of the requested type for A, wiring up the
// symbolic-factorization entry points. Numeric factorization/solve routines are
// installed later by the symbolic phase.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
{
  Mat                fact = nullptr;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(GetHandles_(&dctx));
  // no preallocation: factor storage is allocated when factorization actually runs
  PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
  fact->factortype = ftype;
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU: // fall-through
    // ILU of a dense matrix is just LU
    fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqDense;
    fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC: // fall-through
    // ICC of a dense matrix is just Cholesky
    fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
    break;
  case MAT_FACTOR_QR: {
    // QR has no ops-table slot; it is dispatched through composed functions
    const auto pobj = PetscObjectCast(fact);

    PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
    PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
  } break;
  case MAT_FACTOR_NONE:
  case MAT_FACTOR_ILUDT:     // fall-through
  case MAT_FACTOR_NUM_TYPES: // fall-through
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
  }
  PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
  // dense factorizations need no fill-reducing ordering; advertise "external" for all
  // supported factor types (preferredordering is indexed by MatFactorType)
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
  *fact_out = fact;
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Compute the explicit inverse of a Cholesky-factored matrix in place using the device
// solver's potri, then clear the factorization state so A behaves as a plain matrix.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
{
  const auto         mimpl = MatIMPLCast(A);
  const auto         mcu   = MatCUPMCast(A);
  const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
  cupmSolverHandle_t handle;
  PetscDeviceContext dctx;
  cupmStream_t       stream;

  PetscFunctionBegin;
#if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
  // HIP appears to have this by default??
  PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
#endif
  if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
  // spd: a pivot array would indicate a symmetric-indefinite (sytrf) factorization,
  // whose inverse (sytri) is not provided by the device solver
  PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());

  PetscCall(GetHandles_(&dctx, &handle, &stream));
  {
    const auto    da  = DeviceArrayReadWrite(dctx, A);
    const auto    lda = static_cast<cupmBlasInt_t>(mimpl->lda);
    cupmBlasInt_t il;

    // query required workspace and grow the cached device workspace if too small
    PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
    if (il > mcu->d_fact_lwork) {
      mcu->d_fact_lwork = il;
      PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
      PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
    }
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
    PetscCall(PetscLogGpuTimeEnd());
  }
  PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
  // potri only fills the lower triangle; mirror it to the upper half
  // TODO (write cuda kernel)
  PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
  PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));

  // A now holds the explicit inverse, not a factorization: strip the solve hooks
  A->ops->solve          = nullptr;
  A->ops->solvetranspose = nullptr;
  A->ops->matsolve       = nullptr;
  A->factortype          = MAT_FACTOR_NONE;

  PetscCall(PetscFree(A->solvertype));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1898: // ==========================================================================================
// Expose the [rbegin, rend) x [cbegin, cend) submatrix of A as a Mat that aliases A's
// storage (no copy); the view is cached in mimpl->cmat and must be returned via
// RestoreSubMatrix().
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept
{
  const auto mimpl        = MatIMPLCast(A);
  // offset a base pointer to entry (rbegin, cbegin) in column-major storage
  const auto array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
  const auto n            = rend - rbegin;
  const auto m            = cend - cbegin;
  auto      &cmat         = mimpl->cmat;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  // stored off-by-one so cbegin == 0 still reads as "in use"
  mimpl->matinuse = cbegin + 1;

  PetscCall(GetHandles_(&dctx));
  // make sure the device copy is current before aliasing it
  PetscCall(HostToDevice_(A, dctx));

  // the cached view cannot be reused if the requested dimensions changed
  if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
  {
    const auto device_array = array_offset(MatCUPMCast(A)->d_v);

    if (cmat) {
      PetscCall(PlaceArray(cmat, device_array));
    } else {
      PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
    }
  }
  // the view strides through A's storage, so it inherits A's leading dimension
  PetscCall(MatDenseSetLDA(cmat, mimpl->lda));
  // place CPU array if present but do not copy any data (marking the view as
  // device-resident first appears intended to keep MatDensePlaceArray from syncing —
  // NOTE(review): confirm against MatDensePlaceArray semantics)
  if (const auto host_array = mimpl->v) {
    cmat->offloadmask = PETSC_OFFLOAD_GPU;
    PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
  }

  // the view now mirrors A's actual offload state
  cmat->offloadmask = A->offloadmask;
  *mat              = cmat;
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Return the submatrix view obtained from GetSubMatrix(): un-place the aliased arrays
// and propagate any host-side modifications made through the view back to A.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto cmat  = mimpl->cmat;
  // a host array was placed on the view iff A has host storage (see GetSubMatrix())
  const auto reset = static_cast<bool>(mimpl->v);
  bool       copy, was_offload_host;

  PetscFunctionBegin;
  PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
  PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
  PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
  mimpl->matinuse = 0;

  // calls to ResetArray may change it, so save it here
  was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
  if (was_offload_host && !reset) {
    // the view was modified on the host but A itself has no host array: allocate one so
    // the (host-resident) changes have somewhere to land, then sync below
    copy = true;
    PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
  } else {
    copy = false;
  }

  PetscCall(ResetArray(cmat));
  if (reset) PetscCall(MatDenseResetArray(cmat));
  if (copy) {
    PetscDeviceContext dctx;

    PetscCall(GetHandles_(&dctx));
    // push the host-side changes (made through the view) up to the device copy of A
    PetscCall(DeviceToHost_(A, dctx));
  } else {
    // no copy needed; A's valid copy is wherever the view was last touched
    A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
  }

  // the view no longer owns any storage
  cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  *m                = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1979: // ==========================================================================================
1981: namespace
1982: {
1984: template <device::cupm::DeviceType T>
1985: inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
1986: {
1987: PetscFunctionBegin;
1988: if (TA) {
1989: if (TB) {
1990: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
1991: } else {
1992: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
1993: }
1994: } else {
1995: if (TB) {
1996: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
1997: } else {
1998: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
1999: }
2000: }
2001: PetscFunctionReturn(PETSC_SUCCESS);
2002: }
2004: template <device::cupm::DeviceType T>
2005: inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
2006: {
2007: PetscFunctionBegin;
2008: for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
2009: PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
2010: PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
2011: }
2012: PetscFunctionReturn(PETSC_SUCCESS);
2013: }
2015: } // anonymous namespace
2017: } // namespace impl
2019: } // namespace cupm
2021: } // namespace mat
2023: } // namespace Petsc