Actual source code: matseqdensecupm.hpp
1: #pragma once
3: #include <petsc/private/matdensecupmimpl.h>
4: #include <../src/mat/impls/dense/seq/dense.h>
6: #include <petsc/private/deviceimpl.h>
7: #include <petsc/private/randomimpl.h>
8: #include <petsc/private/vecimpl.h>
9: #include <petsc/private/cupmobject.hpp>
10: #include <petsc/private/cupmsolverinterface.hpp>
12: #include <petsc/private/cpp/type_traits.hpp>
13: #include <petsc/private/cpp/utility.hpp>
15: #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp>
17: namespace Petsc
18: {
20: namespace mat
21: {
23: namespace cupm
24: {
26: namespace impl
27: {
29: template <device::cupm::DeviceType T>
30: class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> {
31: public:
32: MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>);
34: private:
35: struct Mat_SeqDenseCUPM {
36: PetscScalar *d_v; // pointer to the matrix on the GPU
37: PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original
38: bool d_user_alloc;
39: bool d_unplaced_user_alloc;
40: // factorization support
41: cupmBlasInt_t *d_fact_ipiv; // device pivots
42: cupmScalar_t *d_fact_tau; // device QR tau vector
43: cupmBlasInt_t *d_fact_info; // device info
44: cupmScalar_t *d_fact_work; // device workspace
45: cupmBlasInt_t d_fact_lwork; // size of device workspace
46: // workspace
47: Vec workvec;
48: };
50: static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept;
52: static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept;
53: static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept;
55: static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept;
57: template <typename Derived>
58: struct SolveCommon;
59: struct SolveQR;
60: struct SolveCholesky;
61: struct SolveLU;
63: template <typename Solver, bool transpose>
64: static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept;
65: template <typename Solver, bool transpose>
66: static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept;
67: template <bool transpose, bool hermitian>
68: static PetscErrorCode MatMultAddColumnRange_Dispatch_(Mat, Vec, Vec, Vec, PetscInt, PetscInt) noexcept;
69: template <bool transpose, bool hermitian>
70: static PetscErrorCode MatMultColumnRange_Dispatch_(Mat, Vec, Vec, PetscInt, PetscInt) noexcept;
71: template <bool transpose, bool hermitian>
72: static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept;
74: template <bool to_host>
75: static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept;
77: PETSC_NODISCARD static constexpr MatType MATIMPLCUPM_() noexcept;
78: PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept;
80: public:
81: PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept;
83: // define these by hand since they don't fit the above mold
84: PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept;
85: PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept;
87: static PetscErrorCode Create(Mat) noexcept;
88: static PetscErrorCode Destroy(Mat) noexcept;
89: static PetscErrorCode SetUp(Mat) noexcept;
90: static PetscErrorCode Reset(Mat) noexcept;
92: static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept;
93: static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept;
94: static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept;
96: template <PetscMemType, PetscMemoryAccessMode>
97: static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
98: template <PetscMemType, PetscMemoryAccessMode>
99: static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
100: template <PetscMemoryAccessMode>
101: static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
102: template <PetscMemoryAccessMode>
103: static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept;
105: private:
106: template <PetscMemType mtype, PetscMemoryAccessMode mode>
107: static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept
108: {
109: PetscDeviceContext dctx;
111: PetscFunctionBegin;
112: PetscCall(GetHandles_(&dctx));
113: PetscCall(GetArray<mtype, mode>(m, p, dctx));
114: PetscFunctionReturn(PETSC_SUCCESS);
115: }
117: template <PetscMemType mtype, PetscMemoryAccessMode mode>
118: static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept
119: {
120: PetscDeviceContext dctx;
122: PetscFunctionBegin;
123: PetscCall(GetHandles_(&dctx));
124: PetscCall(RestoreArray<mtype, mode>(m, p, dctx));
125: PetscFunctionReturn(PETSC_SUCCESS);
126: }
128: template <PetscMemoryAccessMode mode>
129: static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept
130: {
131: PetscDeviceContext dctx;
133: PetscFunctionBegin;
134: PetscCall(GetHandles_(&dctx));
135: PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx));
136: PetscFunctionReturn(PETSC_SUCCESS);
137: }
139: template <PetscMemoryAccessMode mode>
140: static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept
141: {
142: PetscDeviceContext dctx;
144: PetscFunctionBegin;
145: PetscCall(GetHandles_(&dctx));
146: PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx));
147: PetscFunctionReturn(PETSC_SUCCESS);
148: }
150: public:
151: static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept;
152: static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept;
153: static PetscErrorCode ResetArray(Mat) noexcept;
155: template <bool transpose_A, bool transpose_B>
156: static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept;
157: static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept;
158: static PetscErrorCode ZeroEntries(Mat) noexcept;
159: static PetscErrorCode Conjugate(Mat) noexcept;
160: static PetscErrorCode Scale(Mat, PetscScalar) noexcept;
161: static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept;
162: static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept;
163: static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept;
165: static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept;
166: template <PetscMemoryAccessMode>
167: static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept;
168: template <PetscMemoryAccessMode>
169: static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept;
171: static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept;
172: static PetscErrorCode InvertFactors(Mat) noexcept;
174: static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept;
175: static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept;
176: };
178: } // namespace impl
180: namespace
181: {
183: // Declare this here so that the functions below can make use of it
184: template <device::cupm::DeviceType T>
185: inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept
186: {
187: PetscFunctionBegin;
188: PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate));
189: PetscFunctionReturn(PETSC_SUCCESS);
190: }
192: } // anonymous namespace
194: namespace impl
195: {
197: // ==========================================================================================
198: // MatDense_Seq_CUPM - Private API - Utility
199: // ==========================================================================================
201: template <device::cupm::DeviceType T>
202: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept
203: {
204: const auto mcu = MatCUPMCast(m);
205: const auto nrows = m->rmap->n;
206: const auto ncols = m->cmap->n;
207: auto &lda = MatIMPLCast(m)->lda;
208: cupmStream_t stream;
210: PetscFunctionBegin;
211: PetscCheckTypeName(m, MATSEQDENSECUPM());
213: PetscCall(checkCupmBlasIntCast(nrows));
214: PetscCall(checkCupmBlasIntCast(ncols));
215: PetscCall(GetHandlesFrom_(dctx, &stream));
216: if (lda <= 0) lda = nrows;
217: if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
218: if (user_device_array) {
219: mcu->d_user_alloc = PETSC_TRUE;
220: mcu->d_v = user_device_array;
221: } else {
222: std::size_t size;
224: mcu->d_user_alloc = PETSC_FALSE;
225: size = lda * ncols;
226: PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream));
227: PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream));
228: }
229: m->offloadmask = PETSC_OFFLOAD_GPU;
230: PetscFunctionReturn(PETSC_SUCCESS);
231: }
233: template <device::cupm::DeviceType T>
234: inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept
235: {
236: const auto nrows = m->rmap->n;
237: const auto ncols = m->cmap->n;
238: const auto copy = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED;
240: PetscFunctionBegin;
241: PetscCheckTypeName(m, MATSEQDENSECUPM());
242: if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
243: PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
244: if (copy) {
245: const auto mcu = MatCUPMCast(m);
246: cupmStream_t stream;
248: // Allocate GPU memory if not present
249: if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx, nullptr));
250: PetscCall(GetHandlesFrom_(dctx, &stream));
251: PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0));
252: {
253: const auto mimpl = MatIMPLCast(m);
254: const auto lda = mimpl->lda;
255: const auto src = mimpl->v;
256: const auto dest = mcu->d_v;
258: if (lda > nrows) {
259: PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream));
260: } else {
261: PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream));
262: }
263: }
264: PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0));
265: // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH
266: m->offloadmask = PETSC_OFFLOAD_BOTH;
267: }
268: PetscFunctionReturn(PETSC_SUCCESS);
269: }
271: template <device::cupm::DeviceType T>
272: inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept
273: {
274: const auto nrows = m->rmap->n;
275: const auto ncols = m->cmap->n;
276: const auto copy = m->offloadmask == PETSC_OFFLOAD_GPU;
278: PetscFunctionBegin;
279: PetscCheckTypeName(m, MATSEQDENSECUPM());
280: PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
281: if (copy) {
282: const auto mimpl = MatIMPLCast(m);
283: cupmStream_t stream;
285: // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
286: if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
287: PetscCall(GetHandlesFrom_(dctx, &stream));
288: PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0));
289: {
290: const auto lda = mimpl->lda;
291: const auto dest = mimpl->v;
292: const auto src = MatCUPMCast(m)->d_v;
294: if (lda > nrows) {
295: PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream));
296: } else {
297: PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream));
298: }
299: }
300: PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0));
301: // order is important, MatSeqDenseSetPreallocation() might set offloadmask
302: m->offloadmask = PETSC_OFFLOAD_BOTH;
303: }
304: PetscFunctionReturn(PETSC_SUCCESS);
305: }
307: template <device::cupm::DeviceType T>
308: inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept
309: {
310: PetscFunctionBegin;
311: if (PetscDefined(USE_DEBUG)) {
312: cupmBlasInt_t info = 0;
314: PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream));
315: if (stream) PetscCallCUPM(cupmStreamSynchronize(stream));
316: static_assert(std::is_same<decltype(info), int>::value, "");
317: PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
318: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info);
319: }
320: PetscFunctionReturn(PETSC_SUCCESS);
321: }
323: // ==========================================================================================
324: // MatDense_Seq_CUPM - Private API - Solver Dispatch
325: // ==========================================================================================
327: // specific solvers called through the dispatch_() family of functions
328: template <device::cupm::DeviceType T>
329: template <typename Derived>
330: struct MatDense_Seq_CUPM<T>::SolveCommon {
331: using derived_type = Derived;
333: template <typename F>
334: static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept
335: {
336: cupmBlasInt_t lwork;
338: PetscFunctionBegin;
339: PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork));
340: if (lwork > mcu->d_fact_lwork) {
341: mcu->d_fact_lwork = lwork;
342: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
343: PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream));
344: }
345: PetscFunctionReturn(PETSC_SUCCESS);
346: }
348: static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept
349: {
350: const auto mcu = MatCUPMCast(A);
352: PetscFunctionBegin;
353: PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n));
354: A->factortype = derived_type::MATFACTORTYPE();
355: A->ops->solve = MatSolve_Factored_Dispatch_<derived_type, false>;
356: A->ops->solvetranspose = MatSolve_Factored_Dispatch_<derived_type, true>;
357: A->ops->matsolve = MatMatSolve_Factored_Dispatch_<derived_type, false>;
358: A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>;
360: PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype));
361: if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
362: PetscFunctionReturn(PETSC_SUCCESS);
363: }
364: };
366: template <device::cupm::DeviceType T>
367: struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> {
368: using base_type = SolveCommon<SolveLU>;
370: static constexpr const char *NAME() noexcept { return "LU"; }
371: static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; }
373: static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept
374: {
375: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
376: const auto n = static_cast<cupmBlasInt_t>(A->cmap->n);
377: cupmStream_t stream;
378: cupmSolverHandle_t handle;
379: PetscDeviceContext dctx;
381: PetscFunctionBegin;
382: if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
383: PetscCall(GetHandles_(&dctx, &handle, &stream));
384: PetscCall(base_type::FactorPrepare(A, stream));
385: {
386: const auto mcu = MatCUPMCast(A);
387: const auto da = DeviceArrayReadWrite(dctx, A);
388: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
390: // clang-format off
391: PetscCall(
392: base_type::ResizeFactLwork(
393: mcu, stream,
394: [&](cupmBlasInt_t *fact_lwork)
395: {
396: return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
397: }
398: )
399: );
400: // clang-format on
401: if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
403: PetscCall(PetscLogGpuTimeBegin());
404: PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info));
405: PetscCall(PetscLogGpuTimeEnd());
406: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
407: }
408: PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
409: PetscFunctionReturn(PETSC_SUCCESS);
410: }
412: template <bool transpose>
413: static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
414: {
415: const auto mcu = MatCUPMCast(A);
416: const auto fact_info = mcu->d_fact_info;
417: const auto fact_ipiv = mcu->d_fact_ipiv;
418: cupmSolverHandle_t handle;
420: PetscFunctionBegin;
421: PetscCall(GetHandlesFrom_(dctx, &handle));
422: PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
423: PetscCall(PetscLogGpuTimeBegin());
424: {
425: constexpr auto op = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N;
426: const auto da = DeviceArrayRead(dctx, A);
427: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
429: // clang-format off
430: PetscCall(
431: base_type::ResizeFactLwork(
432: mcu, stream,
433: [&](cupmBlasInt_t *lwork)
434: {
435: return cupmSolverXgetrs_bufferSize(
436: handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork
437: );
438: }
439: )
440: );
441: // clang-format on
442: PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
443: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
444: }
445: PetscCall(PetscLogGpuTimeEnd());
446: PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
447: PetscFunctionReturn(PETSC_SUCCESS);
448: }
449: };
451: template <device::cupm::DeviceType T>
452: struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> {
453: using base_type = SolveCommon<SolveCholesky>;
455: static constexpr const char *NAME() noexcept { return "Cholesky"; }
456: static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; }
458: static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
459: {
460: const auto n = static_cast<cupmBlasInt_t>(A->rmap->n);
461: PetscDeviceContext dctx;
462: cupmSolverHandle_t handle;
463: cupmStream_t stream;
465: PetscFunctionBegin;
466: if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
467: PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName());
468: PetscCall(GetHandles_(&dctx, &handle, &stream));
469: PetscCall(base_type::FactorPrepare(A, stream));
470: {
471: const auto mcu = MatCUPMCast(A);
472: const auto da = DeviceArrayReadWrite(dctx, A);
473: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
475: // clang-format off
476: PetscCall(
477: base_type::ResizeFactLwork(
478: mcu, stream,
479: [&](cupmBlasInt_t *fact_lwork)
480: {
481: return cupmSolverXpotrf_bufferSize(
482: handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork
483: );
484: }
485: )
486: );
487: // clang-format on
488: PetscCall(PetscLogGpuTimeBegin());
489: PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
490: PetscCall(PetscLogGpuTimeEnd());
491: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
492: }
493: PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
495: #if 0
496: // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs
497: // and *hetr* routines. The code below should work, and it can be activated when *sytrs
498: // routines will be available
499: if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
500: if (!mcu->d_fact_lwork) {
501: PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork));
502: PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream));
503: }
504: if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
505: PetscCall(PetscLogGpuTimeBegin());
506: PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
507: PetscCall(PetscLogGpuTimeEnd());
508: #endif
509: PetscFunctionReturn(PETSC_SUCCESS);
510: }
512: template <bool transpose>
513: static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
514: {
515: const auto mcu = MatCUPMCast(A);
516: const auto fact_info = mcu->d_fact_info;
517: cupmSolverHandle_t handle;
519: PetscFunctionBegin;
520: PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName());
521: PetscCall(GetHandlesFrom_(dctx, &handle));
522: PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
523: PetscCall(PetscLogGpuTimeBegin());
524: {
525: const auto da = DeviceArrayRead(dctx, A);
526: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
528: // clang-format off
529: PetscCall(
530: base_type::ResizeFactLwork(
531: mcu, stream,
532: [&](cupmBlasInt_t *lwork)
533: {
534: return cupmSolverXpotrs_bufferSize(
535: handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork
536: );
537: }
538: )
539: );
540: // clang-format on
541: PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
542: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
543: }
544: PetscCall(PetscLogGpuTimeEnd());
545: PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
546: PetscFunctionReturn(PETSC_SUCCESS);
547: }
548: };
550: template <device::cupm::DeviceType T>
551: struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> {
552: using base_type = SolveCommon<SolveQR>;
554: static constexpr const char *NAME() noexcept { return "QR"; }
555: static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; }
557: static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
558: {
559: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
560: const auto n = static_cast<cupmBlasInt_t>(A->cmap->n);
561: const auto min = std::min(m, n);
562: const auto mimpl = MatIMPLCast(A);
563: cupmStream_t stream;
564: cupmSolverHandle_t handle;
565: PetscDeviceContext dctx;
567: PetscFunctionBegin;
568: if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
569: PetscCall(GetHandles_(&dctx, &handle, &stream));
570: PetscCall(base_type::FactorPrepare(A, stream));
571: mimpl->rank = min;
572: {
573: const auto mcu = MatCUPMCast(A);
574: const auto da = DeviceArrayReadWrite(dctx, A);
575: const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
577: if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec));
578: if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream));
579: // clang-format off
580: PetscCall(
581: base_type::ResizeFactLwork(
582: mcu, stream,
583: [&](cupmBlasInt_t *fact_lwork)
584: {
585: return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
586: }
587: )
588: );
589: // clang-format on
590: PetscCall(PetscLogGpuTimeBegin());
591: PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
592: PetscCall(PetscLogGpuTimeEnd());
593: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
594: }
595: PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0)));
596: PetscFunctionReturn(PETSC_SUCCESS);
597: }
599: template <bool transpose>
600: static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
601: {
602: const auto mimpl = MatIMPLCast(A);
603: const auto rank = static_cast<cupmBlasInt_t>(mimpl->rank);
604: const auto mcu = MatCUPMCast(A);
605: const auto fact_info = mcu->d_fact_info;
606: const auto fact_tau = mcu->d_fact_tau;
607: const auto fact_work = mcu->d_fact_work;
608: const auto fact_lwork = mcu->d_fact_lwork;
609: cupmSolverHandle_t solver_handle;
610: cupmBlasHandle_t blas_handle;
612: PetscFunctionBegin;
613: PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle));
614: PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
615: PetscCall(PetscLogGpuTimeBegin());
616: {
617: const auto da = DeviceArrayRead(dctx, A);
618: const auto one = cupmScalarCast(1.0);
619: const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
621: if (transpose) {
622: PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
623: PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
624: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
625: } else {
626: constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T;
628: PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
629: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
630: PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
631: }
632: }
633: PetscCall(PetscLogGpuTimeEnd());
634: PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank))));
635: PetscFunctionReturn(PETSC_SUCCESS);
636: }
637: };
639: template <device::cupm::DeviceType T>
640: template <typename Solver, bool transpose>
641: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept
642: {
643: using namespace vec::cupm;
644: const auto pobj_A = PetscObjectCast(A);
645: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
646: const auto k = static_cast<cupmBlasInt_t>(A->cmap->n);
647: auto &workvec = MatCUPMCast(A)->workvec;
648: PetscScalar *y_array = nullptr;
649: PetscDeviceContext dctx;
650: PetscBool xiscupm, yiscupm, aiscupm;
651: bool use_y_array_directly;
652: cupmStream_t stream;
654: PetscFunctionBegin;
655: PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
656: PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm));
657: PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm));
658: PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm));
659: PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????");
660: PetscCall(GetHandles_(&dctx, &stream));
661: use_y_array_directly = yiscupm && (k >= m);
662: {
663: const PetscScalar *x_array;
664: const auto xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask);
665: const auto copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
667: if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec));
668: // The logic here is to try to minimize the amount of memory copying:
669: //
670: // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded
671: // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the
672: // data in order to copy it into the y array. So the array x will be wherever the data
673: // already is so that only one memcpy is performed
674: if (xisdevice) {
675: PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx));
676: } else {
677: PetscCall(VecGetArrayRead(x, &x_array));
678: }
679: PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx));
680: PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream));
681: if (xisdevice) {
682: PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx));
683: } else {
684: PetscCall(VecRestoreArrayRead(x, &x_array));
685: }
686: }
688: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
689: PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream));
690: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
692: if (use_y_array_directly) {
693: PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx));
694: } else {
695: const auto copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
696: PetscScalar *yv;
698: // The logic here is that the data is not yet in either y's GPU array or its CPU array.
699: // There is nothing in the interface to say where the user would like it to end up. So we
700: // choose the GPU, because it is the faster option
701: if (yiscupm) {
702: PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
703: } else {
704: PetscCall(VecGetArray(y, &yv));
705: }
706: PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
707: if (yiscupm) {
708: PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
709: } else {
710: PetscCall(VecRestoreArray(y, &yv));
711: }
712: PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
713: }
714: PetscFunctionReturn(PETSC_SUCCESS);
715: }
717: template <device::cupm::DeviceType T>
718: template <typename Solver, bool transpose>
719: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
720: {
721: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
722: const auto k = static_cast<cupmBlasInt_t>(A->cmap->n);
723: cupmBlasInt_t nrhs, ldb, ldx, ldy;
724: PetscScalar *y;
725: PetscBool biscupm, xiscupm, aiscupm;
726: PetscDeviceContext dctx;
727: cupmStream_t stream;
729: PetscFunctionBegin;
730: PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
731: PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
732: PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
733: PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
734: PetscCall(GetHandles_(&dctx, &stream));
735: {
736: PetscInt n;
738: PetscCall(MatGetSize(B, nullptr, &n));
739: PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
740: PetscCall(MatDenseGetLDA(B, &n));
741: PetscCall(PetscCUPMBlasIntCast(n, &ldb));
742: PetscCall(MatDenseGetLDA(X, &n));
743: PetscCall(PetscCUPMBlasIntCast(n, &ldx));
744: }
745: {
746: // The logic here is to try to minimize the amount of memory copying:
747: //
748: // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
749: // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to
750: // get the data in order to copy it into the y array. So the array b will be wherever the
751: // data already is so that only one memcpy is performed
752: const auto bisdevice = biscupm && PetscOffloadDevice(B->offloadmask);
753: const auto copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
754: const PetscScalar *b;
756: if (bisdevice) {
757: b = DeviceArrayRead(dctx, B);
758: } else if (biscupm) {
759: b = HostArrayRead(dctx, B);
760: } else {
761: PetscCall(MatDenseGetArrayRead(B, &b));
762: }
764: if (ldx < m || !xiscupm) {
765: // X's array cannot serve as the array (too small or not on device), B's array cannot
766: // serve as the array (const), so allocate a new array
767: ldy = m;
768: PetscCall(PetscCUPMMallocAsync(&y, nrhs * m));
769: } else {
770: // X's array should serve as the array
771: ldy = ldx;
772: y = DeviceArrayWrite(dctx, X);
773: }
774: PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream));
775: if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b));
776: }
778: // convert to CUPM twice??????????????????????????????????
779: // but A should already be CUPM??????????????????????????????????????
780: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
781: PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream));
782: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
784: if (ldx < m || !xiscupm) {
785: const auto copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
786: PetscScalar *x;
788: // The logic here is that the data is not yet in either X's GPU array or its CPU
789: // array. There is nothing in the interface to say where the user would like it to end up.
790: // So we choose the GPU, because it is the faster option
791: if (xiscupm) {
792: x = DeviceArrayWrite(dctx, X);
793: } else {
794: PetscCall(MatDenseGetArray(X, &x));
795: }
796: PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream));
797: if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x));
798: PetscCallCUPM(cupmFreeAsync(y, stream));
799: }
800: PetscFunctionReturn(PETSC_SUCCESS);
801: }
803: template <device::cupm::DeviceType T>
804: template <bool transpose, bool hermitian>
805: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAddColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz, PetscInt c_start, PetscInt c_end) noexcept
806: {
807: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
808: const auto n = static_cast<cupmBlasInt_t>(c_end - c_start);
809: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
810: PetscBool xiscupm, yiscupm, ziscupm;
811: cupmBlasHandle_t handle;
812: Vec x = xx, y = yy, z = zz;
813: PetscDeviceContext dctx;
815: PetscFunctionBegin;
816: PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(xx), &xiscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
817: if (!xiscupm || xx->boundtocpu) {
818: PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(xx)), &x));
819: PetscCall(VecSetLayout(x, xx->map));
820: PetscCall(VecSetType(x, VecSeq_CUPM::VECCUPM()));
821: PetscCall(VecCopy(xx, x));
822: }
824: if (yy) {
825: PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(yy), &yiscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
826: if (!yiscupm || yy->boundtocpu) {
827: PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(yy)), &y));
828: PetscCall(VecSetLayout(y, yy->map));
829: PetscCall(VecSetType(y, VecSeq_CUPM::VECCUPM()));
830: PetscCall(VecCopy(yy, y));
831: }
832: }
834: if (zz != yy) {
835: PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(zz), &ziscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
836: if (!ziscupm || zz->boundtocpu) {
837: PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(zz)), &z));
838: PetscCall(VecSetLayout(z, zz->map));
839: PetscCall(VecSetType(z, VecSeq_CUPM::VECCUPM()));
840: }
841: } else {
842: z = y;
843: }
845: if (y && y != z) PetscCall(VecSeq_CUPM::Copy(y, z)); // mult add
846: if (!m || !n) {
847: // mult only
848: if (!y) PetscCall(VecSeq_CUPM::Set(z, 0.0));
849: PetscFunctionReturn(PETSC_SUCCESS);
850: }
851: PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n));
852: PetscCall(GetHandles_(&dctx, &handle));
853: {
854: constexpr auto op = transpose ? (hermitian ? CUPMBLAS_OP_C : CUPMBLAS_OP_T) : CUPMBLAS_OP_N;
855: const auto one = cupmScalarCast(1.0);
856: const auto zero = cupmScalarCast(0.0);
857: const auto da = DeviceArrayRead(dctx, A);
858: const auto dxx = VecSeq_CUPM::DeviceArrayRead(dctx, x);
859: const auto dzz = VecSeq_CUPM::DeviceArrayReadWrite(dctx, z);
861: PetscCall(PetscLogGpuTimeBegin());
862: PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata() + c_start * lda, lda, dxx.cupmdata() + (transpose ? 0 : c_start), 1, y ? &one : &zero, dzz.cupmdata() + (transpose ? c_start : 0), 1));
863: PetscCall(PetscLogGpuTimeEnd());
864: }
865: PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m)));
866: if (z != zz) {
867: PetscCall(VecCopy(z, zz));
868: if (z != y) PetscCall(VecDestroy(&z));
869: }
870: if (y != yy) PetscCall(VecDestroy(&y));
871: if (x != xx) PetscCall(VecDestroy(&x));
872: PetscFunctionReturn(PETSC_SUCCESS);
873: }
875: template <device::cupm::DeviceType T>
876: template <bool transpose, bool hermitian>
877: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end) noexcept
878: {
879: PetscFunctionBegin;
880: PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, nullptr, yy, c_start, c_end));
881: PetscFunctionReturn(PETSC_SUCCESS);
882: }
884: template <device::cupm::DeviceType T>
885: template <bool transpose, bool hermitian>
886: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept
887: {
888: PetscFunctionBegin;
889: PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, yy, zz, 0, A->cmap->n));
890: PetscFunctionReturn(PETSC_SUCCESS);
891: }
893: // ==========================================================================================
894: // MatDense_Seq_CUPM - Private API - Conversion Dispatch
895: // ==========================================================================================
897: template <device::cupm::DeviceType T>
898: template <bool to_host>
899: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
900: {
901: PetscFunctionBegin;
902: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
903: // TODO these cases should be optimized
904: PetscCall(MatConvert_Basic(M, type, reuse, newmat));
905: } else {
906: const auto B = *newmat;
907: const auto pobj = PetscObjectCast(B);
909: if (to_host) {
910: PetscCall(BindToCPU(B, PETSC_TRUE));
911: PetscCall(Reset(B));
912: } else {
913: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
914: }
916: PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
917: PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
918: // cvec might be the wrong VecType, destroy and rebuild it if necessary
919: // REVIEW ME: this is possibly very inefficient
920: PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));
922: MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
923: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
924: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
925: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
926: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
927: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
928: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
929: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
930: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
931: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
932: MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);
933: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMSetPreallocation_C(), nullptr, SetPreallocation);
935: if (to_host) {
936: B->offloadmask = PETSC_OFFLOAD_CPU;
937: } else {
938: Mat_SeqDenseCUPM *mcu;
940: PetscCall(PetscNew(&mcu));
941: B->spptr = mcu;
942: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
943: PetscCall(BindToCPU(B, PETSC_FALSE));
944: }
946: MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
947: MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
948: }
949: PetscFunctionReturn(PETSC_SUCCESS);
950: }
952: // ==========================================================================================
953: // MatDense_Seq_CUPM - Public API
954: // ==========================================================================================
956: template <device::cupm::DeviceType T>
957: inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
958: {
959: return MATSEQDENSECUPM();
960: }
962: template <device::cupm::DeviceType T>
963: inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
964: {
965: return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
966: }
968: template <device::cupm::DeviceType T>
969: inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
970: {
971: return static_cast<Mat_SeqDense *>(m->data);
972: }
974: template <device::cupm::DeviceType T>
975: inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
976: {
977: return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
978: }
980: template <device::cupm::DeviceType T>
981: inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
982: {
983: return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C";
984: }
986: // ==========================================================================================
988: // MatCreate_SeqDenseCUPM()
989: template <device::cupm::DeviceType T>
990: inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept
991: {
992: PetscFunctionBegin;
993: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
994: PetscCall(MatCreate_SeqDense(A));
995: PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
996: PetscFunctionReturn(PETSC_SUCCESS);
997: }
999: template <device::cupm::DeviceType T>
1000: inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept
1001: {
1002: PetscFunctionBegin;
1003: // prevent copying back data if we own the data pointer
1004: if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
1005: PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1006: PetscCall(MatDestroy_SeqDense(A));
1007: PetscFunctionReturn(PETSC_SUCCESS);
1008: }
1010: // obj->ops->setup()
1011: template <device::cupm::DeviceType T>
1012: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept
1013: {
1014: PetscFunctionBegin;
1015: PetscCall(PetscLayoutSetUp(A->rmap));
1016: PetscCall(PetscLayoutSetUp(A->cmap));
1017: if (!A->preallocated) {
1018: PetscDeviceContext dctx;
1020: PetscCall(GetHandles_(&dctx));
1021: PetscCall(SetPreallocation(A, dctx, nullptr));
1022: }
1023: PetscFunctionReturn(PETSC_SUCCESS);
1024: }
1026: template <device::cupm::DeviceType T>
1027: inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept
1028: {
1029: PetscFunctionBegin;
1030: if (const auto mcu = MatCUPMCast(A)) {
1031: cupmStream_t stream;
1033: PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1034: PetscCall(GetHandles_(&stream));
1035: if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
1036: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream));
1037: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream));
1038: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream));
1039: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1040: PetscCall(VecDestroy(&mcu->workvec));
1041: PetscCall(PetscFree(A->spptr /* mcu */));
1042: }
1043: PetscFunctionReturn(PETSC_SUCCESS);
1044: }
1046: // ==========================================================================================
1048: template <device::cupm::DeviceType T>
1049: inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept
1050: {
1051: const auto mimpl = MatIMPLCast(A);
1052: const auto pobj = PetscObjectCast(A);
1054: PetscFunctionBegin;
1055: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1056: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1057: A->boundtocpu = to_host;
1058: PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
1059: if (to_host) {
1060: PetscDeviceContext dctx;
1062: // make sure we have an up-to-date copy on the CPU
1063: PetscCall(GetHandles_(&dctx));
1064: PetscCall(DeviceToHost_(A, dctx));
1065: } else {
1066: PetscBool iscupm;
1068: if (auto &cvec = mimpl->cvec) {
1069: PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
1070: if (!iscupm) PetscCall(VecDestroy(&cvec));
1071: }
1072: if (auto &cmat = mimpl->cmat) {
1073: PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
1074: if (!iscupm) PetscCall(MatDestroy(&cmat));
1075: }
1076: }
1078: // ============================================================
1079: // Composed ops
1080: // ============================================================
1081: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
1082: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
1083: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1084: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1085: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1086: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1087: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1088: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1089: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1090: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1091: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1092: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
1093: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
1094: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1095: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1096: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
1097: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
1098: MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
1099: MatComposeOp_CUPM(to_host, pobj, "MatMultColumnRange_C", MatMultColumnRange_SeqDense, MatMultColumnRange_Dispatch_</* transpose */ false, /* hermitian */ false>);
1100: MatComposeOp_CUPM(to_host, pobj, "MatMultAddColumnRange_C", MatMultAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ false, /* hermitian */ false>);
1101: MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeColumnRange_C", MatMultHermitianTransposeColumnRange_SeqDense, MatMultColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
1102: MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeAddColumnRange_C", MatMultHermitianTransposeAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
1103: // always the same
1104: PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));
1106: // ============================================================
1107: // Function pointer ops
1108: // ============================================================
1109: MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
1110: MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>(A, xx, nullptr, yy); });
1111: MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>(A, xx, nullptr, yy); });
1112: MatSetOp_CUPM(to_host, A, multhermitiantranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>(A, xx, nullptr, yy); });
1113: MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>);
1114: MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>);
1115: MatSetOp_CUPM(to_host, A, multhermitiantransposeadd, MatMultHermitianTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>);
1116: MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
1117: MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
1118: MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
1119: MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
1120: MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
1121: MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
1122: MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
1123: MatSetOp_CUPM(to_host, A, conjugate, MatConjugate_SeqDense, Conjugate);
1124: MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
1125: MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
1126: MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
1127: MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
1128: MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
1129: MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
1130: MatSetOp_CUPM(to_host, A, getdiagonal, MatGetDiagonal_SeqDense, GetDiagonal);
1131: // seemingly always the same
1132: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1134: if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
1135: PetscFunctionReturn(PETSC_SUCCESS);
1136: }
1138: template <device::cupm::DeviceType T>
1139: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1140: {
1141: PetscFunctionBegin;
1142: PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
1143: PetscFunctionReturn(PETSC_SUCCESS);
1144: }
1146: template <device::cupm::DeviceType T>
1147: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1148: {
1149: PetscFunctionBegin;
1150: PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
1151: PetscFunctionReturn(PETSC_SUCCESS);
1152: }
1154: // ==========================================================================================
1156: template <device::cupm::DeviceType T>
1157: template <PetscMemType mtype, PetscMemoryAccessMode access>
1158: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1159: {
1160: constexpr auto hostmem = PetscMemTypeHost(mtype);
1161: constexpr auto read_access = PetscMemoryAccessRead(access);
1163: PetscFunctionBegin;
1164: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1165: PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
1166: if (hostmem) {
1167: if (read_access) {
1168: PetscCall(DeviceToHost_(m, dctx));
1169: } else if (!MatIMPLCast(m)->v) {
1170: // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
1171: PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
1172: }
1173: *array = MatIMPLCast(m)->v;
1174: } else {
1175: if (read_access) {
1176: PetscCall(HostToDevice_(m, dctx));
1177: } else if (!MatCUPMCast(m)->d_v) {
1178: // write-only
1179: PetscCall(SetPreallocation(m, dctx, nullptr));
1180: }
1181: *array = MatCUPMCast(m)->d_v;
1182: }
1183: if (PetscMemoryAccessWrite(access)) {
1184: m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1185: PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1186: }
1187: PetscFunctionReturn(PETSC_SUCCESS);
1188: }
1190: template <device::cupm::DeviceType T>
1191: template <PetscMemType mtype, PetscMemoryAccessMode access>
1192: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
1193: {
1194: PetscFunctionBegin;
1195: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1196: if (PetscMemoryAccessWrite(access)) {
1197: // WRITE or READ_WRITE
1198: m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1199: PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1200: }
1201: if (array) {
1202: PetscCall(CheckPointerMatchesMemType_(*array, mtype));
1203: *array = nullptr;
1204: }
1205: PetscFunctionReturn(PETSC_SUCCESS);
1206: }
1208: template <device::cupm::DeviceType T>
1209: template <PetscMemoryAccessMode access>
1210: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
1211: {
1212: PetscFunctionBegin;
1213: PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1214: if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
1215: PetscFunctionReturn(PETSC_SUCCESS);
1216: }
1218: template <device::cupm::DeviceType T>
1219: template <PetscMemoryAccessMode access>
1220: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1221: {
1222: PetscFunctionBegin;
1223: PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1224: PetscFunctionReturn(PETSC_SUCCESS);
1225: }
1227: // ==========================================================================================
1229: template <device::cupm::DeviceType T>
1230: inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
1231: {
1232: const auto mimpl = MatIMPLCast(A);
1233: const auto mcu = MatCUPMCast(A);
1235: PetscFunctionBegin;
1236: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1237: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1238: PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1239: if (mimpl->v) {
1240: PetscDeviceContext dctx;
1242: PetscCall(GetHandles_(&dctx));
1243: PetscCall(HostToDevice_(A, dctx));
1244: }
1245: mcu->unplacedarray = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
1246: mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
1247: PetscFunctionReturn(PETSC_SUCCESS);
1248: }
1250: template <device::cupm::DeviceType T>
1251: inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
1252: {
1253: const auto mimpl = MatIMPLCast(A);
1254: const auto mcu = MatCUPMCast(A);
1256: PetscFunctionBegin;
1257: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1258: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1259: PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1260: if (!mcu->d_user_alloc) {
1261: cupmStream_t stream;
1263: PetscCall(GetHandles_(&stream));
1264: PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
1265: }
1266: mcu->d_v = const_cast<PetscScalar *>(array);
1267: mcu->d_user_alloc = PETSC_FALSE;
1268: PetscFunctionReturn(PETSC_SUCCESS);
1269: }
1271: template <device::cupm::DeviceType T>
1272: inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
1273: {
1274: const auto mimpl = MatIMPLCast(A);
1275: const auto mcu = MatCUPMCast(A);
1277: PetscFunctionBegin;
1278: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1279: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1280: if (mimpl->v) {
1281: PetscDeviceContext dctx;
1283: PetscCall(GetHandles_(&dctx));
1284: PetscCall(HostToDevice_(A, dctx));
1285: }
1286: mcu->d_v = util::exchange(mcu->unplacedarray, nullptr);
1287: mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
1288: PetscFunctionReturn(PETSC_SUCCESS);
1289: }
1291: // ==========================================================================================
1293: template <device::cupm::DeviceType T>
1294: template <bool transpose_A, bool transpose_B>
1295: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
1296: {
1297: cupmBlasInt_t m, n, k;
1298: PetscBool Aiscupm, Biscupm;
1299: PetscDeviceContext dctx;
1300: cupmBlasHandle_t handle;
1302: PetscFunctionBegin;
1303: PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
1304: PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
1305: PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
1306: if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);
1308: // we may end up with SEQDENSE as one of the arguments
1309: // REVIEW ME: how? and why is it not B and C????????
1310: PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm));
1311: PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
1312: if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
1313: if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B));
1314: PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n));
1315: PetscCall(GetHandles_(&dctx, &handle));
1317: PetscCall(PetscLogGpuTimeBegin());
1318: {
1319: const auto one = cupmScalarCast(1.0);
1320: const auto zero = cupmScalarCast(0.0);
1321: const auto da = DeviceArrayRead(dctx, A);
1322: const auto db = DeviceArrayRead(dctx, B);
1323: const auto dc = DeviceArrayWrite(dctx, C);
1324: PetscInt alda, blda, clda;
1326: PetscCall(MatDenseGetLDA(A, &alda));
1327: PetscCall(MatDenseGetLDA(B, &blda));
1328: PetscCall(MatDenseGetLDA(C, &clda));
1329: PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda));
1330: }
1331: PetscCall(PetscLogGpuTimeEnd());
1333: PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
1334: if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1335: if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1336: PetscFunctionReturn(PETSC_SUCCESS);
1337: }
1339: template <device::cupm::DeviceType T>
1340: inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept
1341: {
1342: const auto m = A->rmap->n;
1343: const auto n = A->cmap->n;
1345: PetscFunctionBegin;
1346: PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
1347: // The two matrices must have the same copy implementation to be eligible for fast copy
1348: if (A->ops->copy == B->ops->copy) {
1349: PetscDeviceContext dctx;
1350: cupmStream_t stream;
1352: PetscCall(GetHandles_(&dctx, &stream));
1353: PetscCall(PetscLogGpuTimeBegin());
1354: {
1355: const auto va = DeviceArrayRead(dctx, A);
1356: const auto vb = DeviceArrayWrite(dctx, B);
1357: // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets
1358: // lda!
1359: const auto lda_a = MatIMPLCast(A)->lda;
1360: const auto lda_b = MatIMPLCast(B)->lda;
1362: if (lda_a > m || lda_b > m) {
1363: PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME());
1364: PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME());
1365: PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream));
1366: } else {
1367: PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream));
1368: }
1369: }
1370: PetscCall(PetscLogGpuTimeEnd());
1371: } else {
1372: PetscCall(MatCopy_Basic(A, B, str));
1373: }
1374: PetscFunctionReturn(PETSC_SUCCESS);
1375: }
1377: template <device::cupm::DeviceType T>
1378: inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept
1379: {
1380: PetscDeviceContext dctx;
1381: cupmStream_t stream;
1383: PetscFunctionBegin;
1384: PetscCall(GetHandles_(&dctx, &stream));
1385: PetscCall(PetscLogGpuTimeBegin());
1386: {
1387: const auto va = DeviceArrayWrite(dctx, m);
1388: const auto lda = MatIMPLCast(m)->lda;
1389: const auto ma = m->rmap->n;
1390: const auto na = m->cmap->n;
1392: if (lda > ma) {
1393: PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream));
1394: } else {
1395: PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream));
1396: }
1397: }
1398: PetscCall(PetscLogGpuTimeEnd());
1399: PetscFunctionReturn(PETSC_SUCCESS);
1400: }
1402: namespace detail
1403: {
1405: // ==========================================================================================
1406: // SubMatIndexFunctor
1407: //
1408: // Iterator which permutes a linear index range into matrix indices for am nrows x ncols
1409: // submat with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for
1410: // the i'th sequential entry in the matrix.
1411: // ==========================================================================================
1412: template <typename T>
1413: struct SubMatIndexFunctor {
1414: PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); }
1416: PetscInt nrows;
1417: PetscInt ncols;
1418: PetscInt lda;
1419: };
1421: template <typename Iterator>
1422: struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<iter_difference_t<Iterator>>> {
1423: using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<iter_difference_t<Iterator>>>;
1425: using iterator = typename base_type::iterator;
1427: constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept :
1428: base_type{
1429: std::move(first), std::move(last), {nrows, ncols, lda}
1430: }
1431: {
1432: }
1434: PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); }
1435: };
1437: namespace
1438: {
1440: template <typename T>
1441: PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept
1442: {
1443: const auto nrows = rend - rstart;
1444: const auto ncols = cend - cstart;
1445: const auto dptr = thrust::device_pointer_cast(ptr);
1447: return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda};
1448: }
1450: } // namespace
1452: struct conjugate {
1453: PETSC_NODISCARD PETSC_HOSTDEVICE_INLINE_DECL PetscScalar operator()(const PetscScalar &x) const noexcept { return PetscConj(x); }
1454: };
1456: } // namespace detail
1458: template <device::cupm::DeviceType T>
1459: inline PetscErrorCode MatDense_Seq_CUPM<T>::Conjugate(Mat A) noexcept
1460: {
1461: const auto m = A->rmap->n;
1462: const auto n = A->cmap->n;
1463: const auto N = m * n;
1464: PetscDeviceContext dctx;
1465: cupmStream_t stream;
1467: PetscFunctionBegin;
1468: if (PetscDefined(USE_COMPLEX)) {
1469: PetscCall(GetHandles_(&dctx, &stream));
1470: PetscCall(PetscLogGpuTimeBegin());
1471: {
1472: const auto da = DeviceArrayReadWrite(dctx, A);
1473: const auto lda = MatIMPLCast(A)->lda;
1474: cupmStream_t stream;
1475: PetscCall(GetHandlesFrom_(dctx, &stream));
1477: if (lda > m) {
1478: // clang-format off
1479: PetscCallThrust(
1480: const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());
1482: THRUST_CALL(
1483: thrust::transform,
1484: stream,
1485: sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
1486: detail::conjugate{}
1487: )
1488: );
1489: // clang-format on
1490: } else {
1491: // clang-format off
1492: PetscCallThrust(
1493: const auto aptr = thrust::device_pointer_cast(da.data());
1495: THRUST_CALL(
1496: thrust::transform,
1497: stream,
1498: aptr, aptr + N, aptr,
1499: detail::conjugate{}
1500: )
1501: );
1502: // clang-format on
1503: }
1504: }
1505: PetscCall(PetscLogGpuTimeEnd());
1506: }
1507: PetscFunctionReturn(PETSC_SUCCESS);
1508: }
1510: template <device::cupm::DeviceType T>
1511: inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept
1512: {
1513: const auto m = A->rmap->n;
1514: const auto n = A->cmap->n;
1515: const auto N = m * n;
1516: PetscDeviceContext dctx;
1518: PetscFunctionBegin;
1519: PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1520: PetscCall(GetHandles_(&dctx));
1521: {
1522: const auto da = DeviceArrayReadWrite(dctx, A);
1523: const auto lda = MatIMPLCast(A)->lda;
1525: if (lda > m) {
1526: cupmStream_t stream;
1528: PetscCall(GetHandlesFrom_(dctx, &stream));
1529: // clang-format off
1530: PetscCallThrust(
1531: const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());
1533: THRUST_CALL(
1534: thrust::transform,
1535: stream,
1536: sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
1537: device::cupm::functors::make_times_equals(alpha)
1538: )
1539: );
1540: // clang-format on
1541: } else {
1542: const auto cu_alpha = cupmScalarCast(alpha);
1543: cupmBlasHandle_t handle;
1545: PetscCall(GetHandlesFrom_(dctx, &handle));
1546: PetscCall(PetscLogGpuTimeBegin());
1547: PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
1548: PetscCall(PetscLogGpuTimeEnd());
1549: }
1550: }
1551: PetscCall(PetscLogGpuFlops(N));
1552: PetscFunctionReturn(PETSC_SUCCESS);
1553: }
1555: template <device::cupm::DeviceType T>
1556: inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
1557: {
1558: const auto m_x = X->rmap->n, m_y = Y->rmap->n;
1559: const auto n_x = X->cmap->n, n_y = Y->cmap->n;
1560: const auto N = m_x * n_x;
1561: PetscDeviceContext dctx;
1563: PetscFunctionBegin;
1564: if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
1565: PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
1566: PetscCall(GetHandles_(&dctx));
1567: {
1568: const auto dx = DeviceArrayRead(dctx, X);
1569: const auto dy = DeviceArrayReadWrite(dctx, Y);
1570: const auto lda_x = MatIMPLCast(X)->lda;
1571: const auto lda_y = MatIMPLCast(Y)->lda;
1573: if (lda_x > m_x || lda_y > m_x) {
1574: cupmStream_t stream;
1576: PetscCall(GetHandlesFrom_(dctx, &stream));
1577: // clang-format off
1578: PetscCallThrust(
1579: const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data());
1580: const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data());
1582: THRUST_CALL(
1583: thrust::transform,
1584: stream,
1585: sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(),
1586: device::cupm::functors::make_axpy(alpha)
1587: );
1588: );
1589: // clang-format on
1590: } else {
1591: const auto cu_alpha = cupmScalarCast(alpha);
1592: cupmBlasHandle_t handle;
1594: PetscCall(GetHandlesFrom_(dctx, &handle));
1595: PetscCall(PetscLogGpuTimeBegin());
1596: PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1));
1597: PetscCall(PetscLogGpuTimeEnd());
1598: }
1599: }
1600: PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
1601: PetscFunctionReturn(PETSC_SUCCESS);
1602: }
1604: template <device::cupm::DeviceType T>
1605: inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
1606: {
1607: const auto hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
1608: PetscDeviceContext dctx;
1610: PetscFunctionBegin;
1611: PetscCall(GetHandles_(&dctx));
1612: // do not call SetPreallocation() yet, we call it afterwards??
1613: PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
1614: PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
1615: if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
1616: // allocate memory if needed
1617: if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx, nullptr));
1618: PetscFunctionReturn(PETSC_SUCCESS);
1619: }
1621: template <device::cupm::DeviceType T>
1622: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
1623: {
1624: PetscBool device_rand_is_rander48;
1625: PetscBool device = PETSC_FALSE;
1627: PetscFunctionBegin;
1628: // CUPMObject::PETSCDEVICERAD() is PETSCRANDER48 until PetscRandom is implemented for hiprand
1629: PetscCall(PetscStrncmp(PETSCDEVICERAND(), PETSCRANDER48, sizeof(PETSCRANDER48), &device_rand_is_rander48));
1630: if (!device_rand_is_rander48) PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
1631: if (device) {
1632: const auto m = A->rmap->n;
1633: const auto n = A->cmap->n;
1634: PetscDeviceContext dctx;
1636: PetscCall(GetHandles_(&dctx));
1637: {
1638: const auto a = DeviceArrayWrite(dctx, A);
1639: PetscInt lda;
1641: PetscCall(MatDenseGetLDA(A, &lda));
1642: if (lda > m) {
1643: for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
1644: } else {
1645: PetscInt mn;
1647: PetscCall(PetscIntMultError(m, n, &mn));
1648: PetscCall(PetscRandomGetValues(rng, mn, a));
1649: }
1650: }
1651: } else {
1652: PetscCall(MatSetRandom_SeqDense(A, rng));
1653: }
1654: PetscFunctionReturn(PETSC_SUCCESS);
1655: }
1657: // ==========================================================================================
1659: template <device::cupm::DeviceType T>
1660: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
1661: {
1662: const auto offloadmask = A->offloadmask;
1663: const auto n = A->rmap->n;
1664: const auto col_offset = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
1665: PetscBool viscupm;
1666: PetscDeviceContext dctx;
1667: cupmStream_t stream;
1669: PetscFunctionBegin;
1670: PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
1671: PetscCall(GetHandles_(&dctx, &stream));
1672: if (viscupm && !v->boundtocpu) {
1673: const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);
1675: // update device data
1676: if (PetscOffloadDevice(offloadmask)) {
1677: PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
1678: } else {
1679: PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
1680: }
1681: } else {
1682: PetscScalar *x;
1684: // update host data
1685: PetscCall(VecGetArrayWrite(v, &x));
1686: if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
1687: PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
1688: } else if (PetscOffloadDevice(offloadmask)) {
1689: PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
1690: }
1691: PetscCall(VecRestoreArrayWrite(v, &x));
1692: }
1693: PetscFunctionReturn(PETSC_SUCCESS);
1694: }
1696: template <device::cupm::DeviceType T>
1697: template <PetscMemoryAccessMode access>
1698: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
1699: {
1700: using namespace vec::cupm;
1701: const auto mimpl = MatIMPLCast(A);
1702: PetscDeviceContext dctx;
1704: PetscFunctionBegin;
1705: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1706: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1707: mimpl->vecinuse = col + 1;
1708: if (!mimpl->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &mimpl->cvec));
1709: PetscCall(GetHandles_(&dctx));
1710: PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1711: PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
1712: if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
1713: *v = mimpl->cvec;
1714: PetscFunctionReturn(PETSC_SUCCESS);
1715: }
1717: template <device::cupm::DeviceType T>
1718: template <PetscMemoryAccessMode access>
1719: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
1720: {
1721: using namespace vec::cupm;
1722: const auto mimpl = MatIMPLCast(A);
1723: const auto cvec = mimpl->cvec;
1724: PetscDeviceContext dctx;
1726: PetscFunctionBegin;
1727: PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1728: PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1729: mimpl->vecinuse = 0;
1730: if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
1731: PetscCall(VecCUPMResetArrayAsync<T>(cvec));
1732: PetscCall(GetHandles_(&dctx));
1733: PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1734: if (v) *v = nullptr;
1735: PetscFunctionReturn(PETSC_SUCCESS);
1736: }
1738: // ==========================================================================================
1740: template <device::cupm::DeviceType T>
1741: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
1742: {
1743: Mat fact = nullptr;
1744: PetscDeviceContext dctx;
1746: PetscFunctionBegin;
1747: PetscCall(GetHandles_(&dctx));
1748: PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
1749: fact->factortype = ftype;
1750: switch (ftype) {
1751: case MAT_FACTOR_LU:
1752: case MAT_FACTOR_ILU: // fall-through
1753: fact->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1754: fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1755: break;
1756: case MAT_FACTOR_CHOLESKY:
1757: case MAT_FACTOR_ICC: // fall-through
1758: fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1759: break;
1760: case MAT_FACTOR_QR: {
1761: const auto pobj = PetscObjectCast(fact);
1763: PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
1764: PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1765: } break;
1766: case MAT_FACTOR_NONE:
1767: case MAT_FACTOR_ILUDT: // fall-through
1768: case MAT_FACTOR_NUM_TYPES: // fall-through
1769: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
1770: }
1771: PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
1772: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
1773: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
1774: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
1775: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
1776: *fact_out = fact;
1777: PetscFunctionReturn(PETSC_SUCCESS);
1778: }
1780: template <device::cupm::DeviceType T>
1781: inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
1782: {
1783: const auto mimpl = MatIMPLCast(A);
1784: const auto mcu = MatCUPMCast(A);
1785: const auto n = static_cast<cupmBlasInt_t>(A->cmap->n);
1786: cupmSolverHandle_t handle;
1787: PetscDeviceContext dctx;
1788: cupmStream_t stream;
1790: PetscFunctionBegin;
1791: #if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
1792: // HIP appears to have this by default??
1793: PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
1794: #endif
1795: if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS);
1796: PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
1797: // spd
1798: PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());
1800: PetscCall(GetHandles_(&dctx, &handle, &stream));
1801: {
1802: const auto da = DeviceArrayReadWrite(dctx, A);
1803: const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
1804: cupmBlasInt_t il;
1806: PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
1807: if (il > mcu->d_fact_lwork) {
1808: mcu->d_fact_lwork = il;
1809: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1810: PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
1811: }
1812: PetscCall(PetscLogGpuTimeBegin());
1813: PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
1814: PetscCall(PetscLogGpuTimeEnd());
1815: }
1816: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
1817: // TODO (write cuda kernel)
1818: PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
1819: PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
1821: A->ops->solve = nullptr;
1822: A->ops->solvetranspose = nullptr;
1823: A->ops->matsolve = nullptr;
1824: A->factortype = MAT_FACTOR_NONE;
1826: PetscCall(PetscFree(A->solvertype));
1827: PetscFunctionReturn(PETSC_SUCCESS);
1828: }
1830: // ==========================================================================================
1832: template <device::cupm::DeviceType T>
1833: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept
1834: {
1835: const auto mimpl = MatIMPLCast(A);
1836: const auto array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
1837: const auto n = rend - rbegin;
1838: const auto m = cend - cbegin;
1839: auto &cmat = mimpl->cmat;
1840: PetscDeviceContext dctx;
1842: PetscFunctionBegin;
1843: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1844: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1845: mimpl->matinuse = cbegin + 1;
1847: PetscCall(GetHandles_(&dctx));
1848: PetscCall(HostToDevice_(A, dctx));
1850: if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
1851: {
1852: const auto device_array = array_offset(MatCUPMCast(A)->d_v);
1854: if (cmat) {
1855: PetscCall(PlaceArray(cmat, device_array));
1856: } else {
1857: PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
1858: }
1859: }
1860: PetscCall(MatDenseSetLDA(cmat, mimpl->lda));
1861: // place CPU array if present but do not copy any data
1862: if (const auto host_array = mimpl->v) {
1863: cmat->offloadmask = PETSC_OFFLOAD_GPU;
1864: PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
1865: }
1867: cmat->offloadmask = A->offloadmask;
1868: *mat = cmat;
1869: PetscFunctionReturn(PETSC_SUCCESS);
1870: }
1872: template <device::cupm::DeviceType T>
1873: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
1874: {
1875: const auto mimpl = MatIMPLCast(A);
1876: const auto cmat = mimpl->cmat;
1877: const auto reset = static_cast<bool>(mimpl->v);
1878: bool copy, was_offload_host;
1880: PetscFunctionBegin;
1881: PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1882: PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
1883: PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1884: mimpl->matinuse = 0;
1886: // calls to ResetArray may change it, so save it here
1887: was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
1888: if (was_offload_host && !reset) {
1889: copy = true;
1890: PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
1891: } else {
1892: copy = false;
1893: }
1895: PetscCall(ResetArray(cmat));
1896: if (reset) PetscCall(MatDenseResetArray(cmat));
1897: if (copy) {
1898: PetscDeviceContext dctx;
1900: PetscCall(GetHandles_(&dctx));
1901: PetscCall(DeviceToHost_(A, dctx));
1902: } else {
1903: A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1904: }
1906: cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1907: *m = nullptr;
1908: PetscFunctionReturn(PETSC_SUCCESS);
1909: }
1911: // ==========================================================================================
1913: namespace
1914: {
1916: template <device::cupm::DeviceType T>
1917: inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
1918: {
1919: PetscFunctionBegin;
1920: if (TA) {
1921: if (TB) {
1922: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
1923: } else {
1924: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
1925: }
1926: } else {
1927: if (TB) {
1928: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
1929: } else {
1930: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
1931: }
1932: }
1933: PetscFunctionReturn(PETSC_SUCCESS);
1934: }
1936: template <device::cupm::DeviceType T>
1937: inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
1938: {
1939: PetscFunctionBegin;
1940: for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
1941: PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
1942: PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
1943: }
1944: PetscFunctionReturn(PETSC_SUCCESS);
1945: }
1947: } // anonymous namespace
1949: } // namespace impl
1951: } // namespace cupm
1953: } // namespace mat
1955: } // namespace Petsc