// matseqdensecupm.hpp

  1: #pragma once

  3: #include <petsc/private/matdensecupmimpl.h>
  4: #include <../src/mat/impls/dense/seq/dense.h>

  6: #include <petsc/private/deviceimpl.h>
  7: #include <petsc/private/randomimpl.h>
  8: #include <petsc/private/vecimpl.h>
  9: #include <petsc/private/cupmobject.hpp>
 10: #include <petsc/private/cupmsolverinterface.hpp>

 12: #include <petsc/private/cpp/type_traits.hpp>
 13: #include <petsc/private/cpp/utility.hpp>

 15: #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp>

 17: namespace Petsc
 18: {

 20: namespace mat
 21: {

 23: namespace cupm
 24: {

 26: namespace impl
 27: {

// Sequential (single-rank) dense matrix implementation backed by CUPM (CUDA or HIP,
// selected by the DeviceType template parameter). Derives its shared machinery from
// MatDense_CUPM via CRTP.
template <device::cupm::DeviceType T>
class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> {
public:
  MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>);

private:
  // Device-side companion data attached to a MATSEQDENSECUPM matrix
  struct Mat_SeqDenseCUPM {
    PetscScalar *d_v;           // pointer to the matrix on the GPU
    PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original
    bool         d_user_alloc;           // true when d_v is user-provided (must not be freed here)
    bool         d_unplaced_user_alloc;  // ownership flag for unplacedarray
    // factorization support
    cupmBlasInt_t *d_fact_ipiv;  // device pivots
    cupmScalar_t  *d_fact_tau;   // device QR tau vector
    cupmBlasInt_t *d_fact_info;  // device info
    cupmScalar_t  *d_fact_work;  // device workspace
    cupmBlasInt_t  d_fact_lwork; // size of device workspace
    // workspace
    Vec workvec; // scratch vector used by the factored-solve paths
  };

  // (Re)allocate the device array d_v, or adopt a caller-supplied device array when the
  // third argument is non-null
  static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept;

  // Offload-mask-aware copies between the host array (Mat_SeqDense::v) and d_v
  static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept;
  static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept;

  // In debug builds, copy the cupmSolver info value back to the host and raise on failure
  static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept;

  // CRTP solver helpers implementing the LU/Cholesky/QR factorizations and solves
  template <typename Derived>
  struct SolveCommon;
  struct SolveQR;
  struct SolveCholesky;
  struct SolveLU;

  // Dispatch helpers parameterized on the solver kind and (conjugate-)transpose flags
  template <typename Solver, bool transpose>
  static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept;
  template <typename Solver, bool transpose>
  static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept;
  template <bool transpose, bool hermitian>
  static PetscErrorCode MatMultAddColumnRange_Dispatch_(Mat, Vec, Vec, Vec, PetscInt, PetscInt) noexcept;
  template <bool transpose, bool hermitian>
  static PetscErrorCode MatMultColumnRange_Dispatch_(Mat, Vec, Vec, PetscInt, PetscInt) noexcept;
  template <bool transpose, bool hermitian>
  static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept;

  // Conversion between MATSEQDENSE and MATSEQDENSECUPM, direction selected at compile time
  template <bool to_host>
  static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept;

  PETSC_NODISCARD static constexpr MatType       MATIMPLCUPM_() noexcept;
  PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept;

public:
  PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept;

  // define these by hand since they don't fit the above mold
  PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept;
  PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept;

  // MatOps-style entry points registered on the Mat
  static PetscErrorCode Create(Mat) noexcept;
  static PetscErrorCode Destroy(Mat) noexcept;
  static PetscErrorCode SetUp(Mat) noexcept;
  static PetscErrorCode Reset(Mat) noexcept;

  static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept;
  static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept;
  static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept;

  // Array accessors parameterized on memory space and access mode
  template <PetscMemType, PetscMemoryAccessMode>
  static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
  template <PetscMemType, PetscMemoryAccessMode>
  static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept;

private:
  // Thin C-callback-compatible wrappers: fetch a device context then forward to the
  // accessors above. These are what get installed as PetscObjectComposeFunction()-style
  // entry points (no device context in their public signature).
  template <PetscMemType mtype, PetscMemoryAccessMode mode>
  static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(GetArray<mtype, mode>(m, p, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  template <PetscMemType mtype, PetscMemoryAccessMode mode>
  static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(RestoreArray<mtype, mode>(m, p, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  template <PetscMemoryAccessMode mode>
  static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  template <PetscMemoryAccessMode mode>
  static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

public:
  // Temporarily substitute / permanently replace / restore the device array
  static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept;
  static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept;
  static PetscErrorCode ResetArray(Mat) noexcept;

  // Numeric kernels and standard Mat operations
  template <bool transpose_A, bool transpose_B>
  static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept;
  static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept;
  static PetscErrorCode ZeroEntries(Mat) noexcept;
  static PetscErrorCode Conjugate(Mat) noexcept;
  static PetscErrorCode Scale(Mat, PetscScalar) noexcept;
  static PetscErrorCode DiagonalScale(Mat, Vec, Vec) noexcept;
  static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept;
  static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept;
  static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept;

  // Column access
  static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept;

  // Factorization support
  static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept;
  static PetscErrorCode InvertFactors(Mat) noexcept;

  // Submatrix views
  static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept;
  static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept;
};

179: } // namespace impl

181: namespace
182: {

184: // Declare this here so that the functions below can make use of it
// Convenience constructor for a sequential dense CUPM matrix. For a sequential matrix the
// local and global sizes coincide, so m and n are forwarded as both. A non-null `data`
// pointer is presumably adopted as the (device) storage rather than allocated — confirm
// against CreateIMPLDenseCUPM.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept
{
  PetscFunctionBegin;
  PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate));
  PetscFunctionReturn(PETSC_SUCCESS);
}

193: } // anonymous namespace

195: namespace impl
196: {

198: // ==========================================================================================
199: // MatDense_Seq_CUPM - Private API - Utility
200: // ==========================================================================================

202: template <device::cupm::DeviceType T>
203: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept
204: {
205:   const auto   mcu   = MatCUPMCast(m);
206:   const auto   nrows = m->rmap->n;
207:   const auto   ncols = m->cmap->n;
208:   auto        &lda   = MatIMPLCast(m)->lda;
209:   cupmStream_t stream;

211:   PetscFunctionBegin;
212:   PetscCheckTypeName(m, MATSEQDENSECUPM());
214:   PetscCall(checkCupmBlasIntCast(nrows));
215:   PetscCall(checkCupmBlasIntCast(ncols));
216:   PetscCall(GetHandlesFrom_(dctx, &stream));
217:   if (lda <= 0) lda = nrows;
218:   if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
219:   if (user_device_array) {
220:     mcu->d_user_alloc = PETSC_TRUE;
221:     mcu->d_v          = user_device_array;
222:   } else {
223:     std::size_t size;

225:     mcu->d_user_alloc = PETSC_FALSE;
226:     size              = lda * ncols;
227:     PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream));
228:     PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream));
229:   }
230:   m->offloadmask = PETSC_OFFLOAD_GPU;
231:   PetscFunctionReturn(PETSC_SUCCESS);
232: }

// Mirror the host array to the device if the host holds the freshest data. No-op for
// CPU-bound matrices and when the device copy is already up to date.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept
{
  const auto nrows = m->rmap->n;
  const auto ncols = m->cmap->n;
  // copy only when the host data is newer than (or the only copy of) the matrix
  const auto copy  = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED;

  PetscFunctionBegin;
  PetscCheckTypeName(m, MATSEQDENSECUPM());
  // matrices bound to the CPU never mirror their data to the device
  if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
  if (copy) {
    const auto   mcu = MatCUPMCast(m);
    cupmStream_t stream;

    // Allocate GPU memory if not present
    if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx, nullptr));
    PetscCall(GetHandlesFrom_(dctx, &stream));
    PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0));
    {
      const auto mimpl = MatIMPLCast(m);
      const auto lda   = mimpl->lda;
      const auto src   = mimpl->v;
      const auto dest  = mcu->d_v;

      // a padded leading dimension (lda > nrows) needs a strided 2D copy; otherwise the
      // storage is contiguous and a single flat copy suffices
      if (lda > nrows) {
        PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream));
      } else {
        PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream));
      }
    }
    PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0));
    // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH
    m->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Mirror the device array back to the host if the device holds the freshest data
// (offload mask PETSC_OFFLOAD_GPU); otherwise the host copy is reused as-is.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept
{
  const auto nrows = m->rmap->n;
  const auto ncols = m->cmap->n;
  const auto copy  = m->offloadmask == PETSC_OFFLOAD_GPU;

  PetscFunctionBegin;
  PetscCheckTypeName(m, MATSEQDENSECUPM());
  PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
  if (copy) {
    const auto   mimpl = MatIMPLCast(m);
    cupmStream_t stream;

    // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
    if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
    PetscCall(GetHandlesFrom_(dctx, &stream));
    PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0));
    {
      const auto lda  = mimpl->lda;
      const auto dest = mimpl->v;
      const auto src  = MatCUPMCast(m)->d_v;

      // padded leading dimension requires a strided 2D copy; contiguous otherwise
      if (lda > nrows) {
        PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream));
      } else {
        PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream));
      }
    }
    PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0));
    // order is important, MatSeqDenseSetPreallocation() might set offloadmask
    m->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

308: template <device::cupm::DeviceType T>
309: inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept
310: {
311:   PetscFunctionBegin;
312:   if (PetscDefined(USE_DEBUG)) {
313:     cupmBlasInt_t info = 0;

315:     PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream));
316:     if (stream) PetscCallCUPM(cupmStreamSynchronize(stream));
317:     static_assert(std::is_same<decltype(info), int>::value, "");
318:     PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
319:     PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info);
320:   }
321:   PetscFunctionReturn(PETSC_SUCCESS);
322: }

324: // ==========================================================================================
325: // MatDense_Seq_CUPM - Private API - Solver Dispatch
326: // ==========================================================================================

328: // specific solvers called through the dispatch_() family of functions
329: template <device::cupm::DeviceType T>
330: template <typename Derived>
331: struct MatDense_Seq_CUPM<T>::SolveCommon {
332:   using derived_type = Derived;

334:   template <typename F>
335:   static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept
336:   {
337:     cupmBlasInt_t lwork;

339:     PetscFunctionBegin;
340:     PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork));
341:     if (lwork > mcu->d_fact_lwork) {
342:       mcu->d_fact_lwork = lwork;
343:       PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
344:       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream));
345:     }
346:     PetscFunctionReturn(PETSC_SUCCESS);
347:   }

349:   static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept
350:   {
351:     const auto mcu = MatCUPMCast(A);

353:     PetscFunctionBegin;
354:     PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n));
355:     A->factortype             = derived_type::MATFACTORTYPE();
356:     A->ops->solve             = MatSolve_Factored_Dispatch_<derived_type, false>;
357:     A->ops->solvetranspose    = MatSolve_Factored_Dispatch_<derived_type, true>;
358:     A->ops->matsolve          = MatMatSolve_Factored_Dispatch_<derived_type, false>;
359:     A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>;

361:     PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype));
362:     if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
363:     PetscFunctionReturn(PETSC_SUCCESS);
364:   }
365: };

// LU factorization (Xgetrf) and factored solve (Xgetrs) on the device
template <device::cupm::DeviceType T>
struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> {
  using base_type = SolveCommon<SolveLU>;

  static constexpr const char   *NAME() noexcept { return "LU"; }
  static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; }

  // In-place LU factorization of A; the IS (row/column permutation) arguments are unused
  // on this backend
  static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept
  {
    const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
    const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
    cupmStream_t       stream;
    cupmSolverHandle_t handle;
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    // nothing to factor for an empty matrix
    if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
    PetscCall(GetHandles_(&dctx, &handle, &stream));
    PetscCall(base_type::FactorPrepare(A, stream));
    {
      const auto mcu = MatCUPMCast(A);
      const auto da  = DeviceArrayReadWrite(dctx, A);
      const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *fact_lwork)
          {
            return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
          }
        )
      );
      // clang-format on
      // the pivot array is allocated once and reused across refactorizations
      if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));

      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
    }
    PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // Solve op(A) X = B in place: x holds the nrhs right-hand sides (leading dimension ldx)
  // on entry and the solutions on exit. m is the system size, k the number of columns of A
  // (used only for logging).
  template <bool transpose>
  static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
  {
    const auto         mcu       = MatCUPMCast(A);
    const auto         fact_info = mcu->d_fact_info;
    const auto         fact_ipiv = mcu->d_fact_ipiv;
    cupmSolverHandle_t handle;

    PetscFunctionBegin;
    PetscCall(GetHandlesFrom_(dctx, &handle));
    PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
    PetscCall(PetscLogGpuTimeBegin());
    {
      constexpr auto op  = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N;
      const auto     da  = DeviceArrayRead(dctx, A);
      const auto     lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *lwork)
          {
            return cupmSolverXgetrs_bufferSize(
              handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork
            );
          }
        )
      );
      // clang-format on
      PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
      PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
};

// Cholesky factorization (Xpotrf) and factored solve (Xpotrs) on the device. Only SPD
// matrices are supported because the backend lacks sytrs/hetrs routines.
template <device::cupm::DeviceType T>
struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> {
  using base_type = SolveCommon<SolveCholesky>;

  static constexpr const char   *NAME() noexcept { return "Cholesky"; }
  static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; }

  // In-place Cholesky (lower triangular) factorization; the IS argument is unused
  static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
  {
    const auto         n = static_cast<cupmBlasInt_t>(A->rmap->n);
    PetscDeviceContext dctx;
    cupmSolverHandle_t handle;
    cupmStream_t       stream;

    PetscFunctionBegin;
    // nothing to factor for an empty matrix
    if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
    PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName());
    PetscCall(GetHandles_(&dctx, &handle, &stream));
    PetscCall(base_type::FactorPrepare(A, stream));
    {
      const auto mcu = MatCUPMCast(A);
      const auto da  = DeviceArrayReadWrite(dctx, A);
      const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *fact_lwork)
          {
            return cupmSolverXpotrf_bufferSize(
              handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork
            );
          }
        )
      );
      // clang-format on
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
    }
    PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));

#if 0
    // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs
    // and *hetr* routines. The code below should work, and it can be activated when *sytrs
    // routines will be available
    if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
    if (!mcu->d_fact_lwork) {
      PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork));
      PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream));
    }
    // NOTE(review): the condition below looks inverted — presumably it should be
    // `if (!mcu->d_fact_info)` (allocate when absent), matching FactorPrepare(); verify
    // before ever enabling this block
    if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // Solve A X = B in place using the Cholesky factors; for SPD matrices the transpose
  // solve coincides with the regular one, hence the template parameter is unused here.
  // x holds the nrhs right-hand sides (leading dimension ldx) on entry, solutions on exit.
  template <bool transpose>
  static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
  {
    const auto         mcu       = MatCUPMCast(A);
    const auto         fact_info = mcu->d_fact_info;
    cupmSolverHandle_t handle;

    PetscFunctionBegin;
    // a non-null pivot array would mean a sytrf-style factorization, which this path
    // cannot solve with potrs
    PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName());
    PetscCall(GetHandlesFrom_(dctx, &handle));
    PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
    PetscCall(PetscLogGpuTimeBegin());
    {
      const auto da  = DeviceArrayRead(dctx, A);
      const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *lwork)
          {
            return cupmSolverXpotrs_bufferSize(
              handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork
            );
          }
        )
      );
      // clang-format on
      PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
      PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
};

// QR factorization (Xgeqrf) and least-squares style solve (Xormqr + triangular solve)
// on the device
template <device::cupm::DeviceType T>
struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> {
  using base_type = SolveCommon<SolveQR>;

  static constexpr const char   *NAME() noexcept { return "QR"; }
  static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; }

  // In-place QR factorization of A; the IS argument is unused. The rank is assumed to be
  // full (min(m, n)) — no rank-revealing pivoting is performed on this backend.
  static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
  {
    const auto         m     = static_cast<cupmBlasInt_t>(A->rmap->n);
    const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
    const auto         min   = std::min(m, n);
    const auto         mimpl = MatIMPLCast(A);
    cupmStream_t       stream;
    cupmSolverHandle_t handle;
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    // nothing to factor for an empty matrix
    if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
    PetscCall(GetHandles_(&dctx, &handle, &stream));
    PetscCall(base_type::FactorPrepare(A, stream));
    mimpl->rank = min;
    {
      const auto mcu = MatCUPMCast(A);
      const auto da  = DeviceArrayReadWrite(dctx, A);
      const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);

      // scratch vector and tau array are allocated once and reused
      if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec));
      if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream));
      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *fact_lwork)
          {
            return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
          }
        )
      );
      // clang-format on
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
    }
    PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0)));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // Solve with the QR factors. Forward solve applies Q^H then a triangular solve with R;
  // transpose solve does the reverse (R^T triangular solve, then apply Q). x holds the
  // nrhs right-hand sides (leading dimension ldx) in place.
  template <bool transpose>
  static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
  {
    const auto         mimpl      = MatIMPLCast(A);
    const auto         rank       = static_cast<cupmBlasInt_t>(mimpl->rank);
    const auto         mcu        = MatCUPMCast(A);
    const auto         fact_info  = mcu->d_fact_info;
    const auto         fact_tau   = mcu->d_fact_tau;
    const auto         fact_work  = mcu->d_fact_work;
    const auto         fact_lwork = mcu->d_fact_lwork;
    cupmSolverHandle_t solver_handle;
    cupmBlasHandle_t   blas_handle;

    PetscFunctionBegin;
    PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle));
    PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
    PetscCall(PetscLogGpuTimeBegin());
    {
      const auto da  = DeviceArrayRead(dctx, A);
      const auto one = cupmScalarCast(1.0);
      const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);

      if (transpose) {
        // x <- R^-T x, then x <- Q x
        PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
        PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
        PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
      } else {
        // x <- Q^H x (conjugate transpose in complex arithmetic), then x <- R^-1 x
        constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T;

        PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
        PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
        PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
      }
    }
    PetscCall(PetscLogGpuTimeEnd());
    // NOTE(review): LU/Cholesky log via PetscLogGpuFlops(); confirm PetscLogFlops() here
    // is intentional rather than an oversight
    PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank))));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
};

// MatSolve() entry point for factored matrices: solves op(A) y = x for a single
// right-hand side, staging x/y between host and device with as few copies as possible.
// The actual triangular solve is delegated to Solver::Solve<transpose>().
template <device::cupm::DeviceType T>
template <typename Solver, bool transpose>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept
{
  using namespace vec::cupm;
  const auto         pobj_A  = PetscObjectCast(A);
  const auto         m       = static_cast<cupmBlasInt_t>(A->rmap->n);
  const auto         k       = static_cast<cupmBlasInt_t>(A->cmap->n);
  auto              &workvec = MatCUPMCast(A)->workvec;
  PetscScalar       *y_array = nullptr;
  PetscDeviceContext dctx;
  PetscBool          xiscupm, yiscupm, aiscupm;
  bool               use_y_array_directly;
  cupmStream_t       stream;

  PetscFunctionBegin;
  PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm));
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm));
  PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm));
  PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????");
  PetscCall(GetHandles_(&dctx, &stream));
  // y can serve as the in-place solve buffer only if it is a device vector large enough
  // to hold the m-sized intermediate (the solve overwrites m entries, the result has k)
  use_y_array_directly = yiscupm && (k >= m);
  {
    const PetscScalar *x_array;
    const auto         xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask);
    const auto         copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;

    if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec));
    // The logic here is to try to minimize the amount of memory copying:
    //
    // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded
    // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the
    // data in order to copy it into the y array. So the array x will be wherever the data
    // already is so that only one memcpy is performed
    if (xisdevice) {
      PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx));
    } else {
      PetscCall(VecGetArrayRead(x, &x_array));
    }
    PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx));
    PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream));
    if (xisdevice) {
      PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx));
    } else {
      PetscCall(VecRestoreArrayRead(x, &x_array));
    }
  }

  // solve in place on the device buffer (single rhs: nrhs = 1, ldx = m)
  if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
  PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream));
  if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));

  if (use_y_array_directly) {
    PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx));
  } else {
    const auto   copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
    PetscScalar *yv;

    // The logic here is that the data is not yet in either y's GPU array or its CPU array.
    // There is nothing in the interface to say where the user would like it to end up. So we
    // choose the GPU, because it is the faster option
    if (yiscupm) {
      PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
    } else {
      PetscCall(VecGetArray(y, &yv));
    }
    PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
    if (yiscupm) {
      PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
    } else {
      PetscCall(VecRestoreArray(y, &yv));
    }
    // NOTE(review): unlike the calls above, no dctx is passed here — presumably the
    // overload defaults it; confirm against VecCUPMRestoreArrayWriteAsync's signature
    PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

718: template <device::cupm::DeviceType T>
719: template <typename Solver, bool transpose>
720: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
721: {
722:   const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
723:   const auto         k = static_cast<cupmBlasInt_t>(A->cmap->n);
724:   cupmBlasInt_t      nrhs, ldb, ldx, ldy;
725:   PetscScalar       *y;
726:   PetscBool          biscupm, xiscupm, aiscupm;
727:   PetscDeviceContext dctx;
728:   cupmStream_t       stream;

730:   PetscFunctionBegin;
731:   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
732:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
733:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
734:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
735:   PetscCall(GetHandles_(&dctx, &stream));
736:   {
737:     PetscInt n;

739:     PetscCall(MatGetSize(B, nullptr, &n));
740:     PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
741:     PetscCall(MatDenseGetLDA(B, &n));
742:     PetscCall(PetscCUPMBlasIntCast(n, &ldb));
743:     PetscCall(MatDenseGetLDA(X, &n));
744:     PetscCall(PetscCUPMBlasIntCast(n, &ldx));
745:   }
746:   {
747:     // The logic here is to try to minimize the amount of memory copying:
748:     //
749:     // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
750:     // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to
751:     // get the data in order to copy it into the y array. So the array b will be wherever the
752:     // data already is so that only one memcpy is performed
753:     const auto         bisdevice = biscupm && PetscOffloadDevice(B->offloadmask);
754:     const auto         copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
755:     const PetscScalar *b;

757:     if (bisdevice) {
758:       b = DeviceArrayRead(dctx, B);
759:     } else if (biscupm) {
760:       b = HostArrayRead(dctx, B);
761:     } else {
762:       PetscCall(MatDenseGetArrayRead(B, &b));
763:     }

765:     if (ldx < m || !xiscupm) {
766:       // X's array cannot serve as the array (too small or not on device), B's array cannot
767:       // serve as the array (const), so allocate a new array
768:       ldy = m;
769:       PetscCall(PetscCUPMMallocAsync(&y, nrhs * m));
770:     } else {
771:       // X's array should serve as the array
772:       ldy = ldx;
773:       y   = DeviceArrayWrite(dctx, X);
774:     }
775:     PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream));
776:     if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b));
777:   }

779:   // convert to CUPM twice??????????????????????????????????
780:   // but A should already be CUPM??????????????????????????????????????
781:   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
782:   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream));
783:   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));

785:   if (ldx < m || !xiscupm) {
786:     const auto   copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
787:     PetscScalar *x;

789:     // The logic here is that the data is not yet in either X's GPU array or its CPU
790:     // array. There is nothing in the interface to say where the user would like it to end up.
791:     // So we choose the GPU, because it is the faster option
792:     if (xiscupm) {
793:       x = DeviceArrayWrite(dctx, X);
794:     } else {
795:       PetscCall(MatDenseGetArray(X, &x));
796:     }
797:     PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream));
798:     if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x));
799:     PetscCallCUPM(cupmFreeAsync(y, stream));
800:   }
801:   PetscFunctionReturn(PETSC_SUCCESS);
802: }

804: template <device::cupm::DeviceType T>
805: template <bool transpose, bool hermitian>
806: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAddColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz, PetscInt c_start, PetscInt c_end) noexcept
807: {
808:   const auto         m   = static_cast<cupmBlasInt_t>(A->rmap->n);
809:   const auto         n   = static_cast<cupmBlasInt_t>(c_end - c_start);
810:   const auto         lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
811:   PetscBool          xiscupm, yiscupm, ziscupm;
812:   cupmBlasHandle_t   handle;
813:   Vec                x = xx, y = yy, z = zz;
814:   PetscDeviceContext dctx;

816:   PetscFunctionBegin;
817:   PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(xx), &xiscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
818:   if (!xiscupm || xx->boundtocpu) {
819:     PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(xx)), &x));
820:     PetscCall(VecSetLayout(x, xx->map));
821:     PetscCall(VecSetType(x, VecSeq_CUPM::VECCUPM()));
822:     PetscCall(VecCopy(xx, x));
823:   }

825:   if (yy) {
826:     PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(yy), &yiscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
827:     if (!yiscupm || yy->boundtocpu) {
828:       PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(yy)), &y));
829:       PetscCall(VecSetLayout(y, yy->map));
830:       PetscCall(VecSetType(y, VecSeq_CUPM::VECCUPM()));
831:       PetscCall(VecCopy(yy, y));
832:     }
833:   }

835:   if (zz != yy) {
836:     PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(zz), &ziscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
837:     if (!ziscupm || zz->boundtocpu) {
838:       PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(zz)), &z));
839:       PetscCall(VecSetLayout(z, zz->map));
840:       PetscCall(VecSetType(z, VecSeq_CUPM::VECCUPM()));
841:     }
842:   } else {
843:     z = y;
844:   }

846:   if (y && y != z) PetscCall(VecSeq_CUPM::Copy(y, z)); // mult add
847:   if (!m || !n) {
848:     // mult only
849:     if (!y) PetscCall(VecSeq_CUPM::Set(z, 0.0));
850:     PetscFunctionReturn(PETSC_SUCCESS);
851:   }
852:   PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n));
853:   PetscCall(GetHandles_(&dctx, &handle));
854:   {
855:     constexpr auto op   = transpose ? (hermitian ? CUPMBLAS_OP_C : CUPMBLAS_OP_T) : CUPMBLAS_OP_N;
856:     const auto     one  = cupmScalarCast(1.0);
857:     const auto     zero = cupmScalarCast(0.0);
858:     const auto     da   = DeviceArrayRead(dctx, A);
859:     const auto     dxx  = VecSeq_CUPM::DeviceArrayRead(dctx, x);
860:     const auto     dzz  = VecSeq_CUPM::DeviceArrayReadWrite(dctx, z);

862:     PetscCall(PetscLogGpuTimeBegin());
863:     PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata() + c_start * lda, lda, dxx.cupmdata() + (transpose ? 0 : c_start), 1, y ? &one : &zero, dzz.cupmdata() + (transpose ? c_start : 0), 1));
864:     PetscCall(PetscLogGpuTimeEnd());
865:   }
866:   PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m)));
867:   if (z != zz) {
868:     PetscCall(VecCopy(z, zz));
869:     if (z != y) PetscCall(VecDestroy(&z));
870:   }
871:   if (y != yy) PetscCall(VecDestroy(&y));
872:   if (x != xx) PetscCall(VecDestroy(&x));
873:   PetscFunctionReturn(PETSC_SUCCESS);
874: }

// Plain (non-add) column-range mult: forwards to the add-dispatch with a null "add" vector,
// which makes the GEMV overwrite the output (beta = 0) instead of accumulating.
template <device::cupm::DeviceType T>
template <bool transpose, bool hermitian>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, nullptr, yy, c_start, c_end));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Full-range mult-add: zz = yy + op(A) * xx, implemented as a column-range dispatch over all
// columns [0, A->cmap->n).
template <device::cupm::DeviceType T>
template <bool transpose, bool hermitian>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, yy, zz, 0, A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

894: // ==========================================================================================
895: // MatDense_Seq_CUPM - Private API - Conversion Dispatch
896: // ==========================================================================================

// Convert between MATSEQDENSE and MATSEQDENSECUPM (direction given by the to_host template
// flag). Only the MAT_INPLACE_MATRIX path is specialized: it flips the type name, default
// vector type, composed methods, and device-side state on the existing matrix B.
template <device::cupm::DeviceType T>
template <bool to_host>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
{
  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
    // TODO these cases should be optimized
    PetscCall(MatConvert_Basic(M, type, reuse, newmat));
  } else {
    const auto B    = *newmat;
    const auto pobj = PetscObjectCast(B);

    if (to_host) {
      // syncs device data back to the host and frees all device-side state
      PetscCall(BindToCPU(B, PETSC_TRUE));
      PetscCall(Reset(B));
    } else {
      PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
    }

    PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
    PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
    // cvec might be the wrong VecType, destroy and rebuild it if necessary
    // REVIEW ME: this is possibly very inefficient
    PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));

    // MatComposeOp_CUPM(to_host, ...) composes the device function when converting to
    // device, and the host fallback (here nullptr, i.e. remove) when converting to host
    MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
    MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMSetPreallocation_C(), nullptr, SetPreallocation);

    if (to_host) {
      B->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      Mat_SeqDenseCUPM *mcu;

      // fresh (zeroed) device-side state; device memory is allocated lazily on first use
      PetscCall(PetscNew(&mcu));
      B->spptr       = mcu;
      B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
      PetscCall(BindToCPU(B, PETSC_FALSE));
    }

    MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
    MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

953: // ==========================================================================================
954: // MatDense_Seq_CUPM - Public API
955: // ==========================================================================================

// Implementation MatType string for this class (consumed by the MatDense_CUPM base).
template <device::cupm::DeviceType T>
inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
{
  return MATSEQDENSECUPM();
}

// Downcast a Mat's device-side spptr to this class's Mat_SeqDenseCUPM state (may be null if
// the device state has not been created yet).
template <device::cupm::DeviceType T>
inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
{
  return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
}

// Downcast a Mat's data pointer to the host-side Mat_SeqDense implementation struct.
template <device::cupm::DeviceType T>
inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
{
  return static_cast<Mat_SeqDense *>(m->data);
}

// Name of the composed device-to-host conversion method, selected per backend at compile time.
template <device::cupm::DeviceType T>
inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
{
  return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
}

// Name of the composed AIJ-times-dense product-setup method, selected per backend at compile time.
template <device::cupm::DeviceType T>
inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
{
  return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C";
}

987: // ==========================================================================================

// MatCreate_SeqDenseCUPM(): build a host SEQDENSE matrix, then convert it in place to the
// device type (which installs the CUPM ops and allocates the device-side state struct).
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
  PetscCall(MatCreate_SeqDense(A));
  PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// obj->ops->destroy(): tear down by converting back to plain SEQDENSE (which frees all
// device state) and then running the host destructor.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept
{
  PetscFunctionBegin;
  // prevent copying back data if we own the data pointer: marking the host copy as current
  // makes the conversion's device-to-host sync a no-op
  if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
  PetscCall(MatDestroy_SeqDense(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// obj->ops->setup(): finalize the layouts and lazily preallocate device storage if the user
// has not already done so.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept
{
  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  if (!A->preallocated) {
    PetscDeviceContext dctx;

    PetscCall(GetHandles_(&dctx));
    PetscCall(SetPreallocation(A, dctx, nullptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Free all device-side state (matrix data unless user-owned, factorization buffers, work
// vector) and the Mat_SeqDenseCUPM struct itself. No-op if device state was never created.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept
{
  PetscFunctionBegin;
  if (const auto mcu = MatCUPMCast(A)) {
    cupmStream_t stream;

    // refuse to free while a user array is stashed via PlaceArray
    PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
    PetscCall(GetHandles_(&stream));
    if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
    PetscCall(VecDestroy(&mcu->workvec));
    PetscCall(PetscFree(A->spptr /* mcu */));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

1047: // ==========================================================================================

// obj->ops->bindtocpu(): switch every composed method and function-pointer op between the
// host (SeqDense) and device (CUPM) implementations. When binding to the CPU the device data
// is first synced back to the host.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto pobj  = PetscObjectCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  A->boundtocpu = to_host;
  PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
  if (to_host) {
    PetscDeviceContext dctx;

    // make sure we have an up-to-date copy on the CPU
    PetscCall(GetHandles_(&dctx));
    PetscCall(DeviceToHost_(A, dctx));
  } else {
    PetscBool iscupm;

    // cached column vector / submatrix may be of the host type; drop them so they are
    // rebuilt with the device type on next use
    if (auto &cvec = mimpl->cvec) {
      PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
      if (!iscupm) PetscCall(VecDestroy(&cvec));
    }
    if (auto &cmat = mimpl->cmat) {
      PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
      if (!iscupm) PetscCall(MatDestroy(&cmat));
    }
  }

  // ============================================================
  // Composed ops (host implementation when to_host, device otherwise)
  // ============================================================
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
  MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
  MatComposeOp_CUPM(to_host, pobj, "MatMultColumnRange_C", MatMultColumnRange_SeqDense, MatMultColumnRange_Dispatch_</* transpose */ false, /* hermitian */ false>);
  MatComposeOp_CUPM(to_host, pobj, "MatMultAddColumnRange_C", MatMultAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ false, /* hermitian */ false>);
  MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeColumnRange_C", MatMultHermitianTransposeColumnRange_SeqDense, MatMultColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
  MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeAddColumnRange_C", MatMultHermitianTransposeAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
  // always the same
  PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));

  // ============================================================
  // Function pointer ops
  // ============================================================
  MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
  MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>(A, xx, nullptr, yy); });
  MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>(A, xx, nullptr, yy); });
  // NOTE(review): host fallback here is MatMultTranspose_SeqDense, which would drop the
  // conjugation for complex scalars -- confirm whether MatMultHermitianTranspose_SeqDense
  // should be used instead
  MatSetOp_CUPM(to_host, A, multhermitiantranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>(A, xx, nullptr, yy); });
  MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>);
  MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>);
  MatSetOp_CUPM(to_host, A, multhermitiantransposeadd, MatMultHermitianTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>);
  MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
  MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
  MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
  MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
  MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
  MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
  MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
  MatSetOp_CUPM(to_host, A, conjugate, MatConjugate_SeqDense, Conjugate);
  MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
  MatSetOp_CUPM(to_host, A, diagonalscale, MatDiagonalScale_SeqDense, DiagonalScale);
  MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
  MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
  MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
  MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
  MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
  MatSetOp_CUPM(to_host, A, getdiagonal, MatGetDiagonal_SeqDense, GetDiagonal);
  // seemingly always the same
  A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;

  // propagate the binding to the cached submatrix, if any
  if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Device-to-host conversion entry point (thin wrapper over Convert_Dispatch_).
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
{
  PetscFunctionBegin;
  PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Host-to-device conversion entry point (thin wrapper over Convert_Dispatch_).
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
{
  PetscFunctionBegin;
  PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

1156: // ==========================================================================================

// Hand out the raw host or device array (per mtype), syncing and/or lazily allocating as
// dictated by the requested access mode, and update the offload mask for write access.
template <device::cupm::DeviceType T>
template <PetscMemType mtype, PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
{
  constexpr auto hostmem     = PetscMemTypeHost(mtype);
  constexpr auto read_access = PetscMemoryAccessRead(access);

  PetscFunctionBegin;
  static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
  PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
  if (hostmem) {
    if (read_access) {
      // caller may read: make sure the host copy is current
      PetscCall(DeviceToHost_(m, dctx));
    } else if (!MatIMPLCast(m)->v) {
      // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
      PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
    }
    *array = MatIMPLCast(m)->v;
  } else {
    if (read_access) {
      // caller may read: make sure the device copy is current
      PetscCall(HostToDevice_(m, dctx));
    } else if (!MatCUPMCast(m)->d_v) {
      // write-only
      PetscCall(SetPreallocation(m, dctx, nullptr));
    }
    *array = MatCUPMCast(m)->d_v;
  }
  if (PetscMemoryAccessWrite(access)) {
    // for WRITE or READ_WRITE the handed-out buffer becomes the authoritative copy
    m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
    PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Counterpart to GetArray(): re-assert the offload mask / object state for write access and
// null out the caller's pointer (after sanity-checking it matches the requested memtype).
template <device::cupm::DeviceType T>
template <PetscMemType mtype, PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
{
  PetscFunctionBegin;
  static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
  if (PetscMemoryAccessWrite(access)) {
    // WRITE or READ_WRITE
    m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
    PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
  }
  if (array) {
    PetscCall(CheckPointerMatchesMemType_(*array, mtype));
    *array = nullptr;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// MatDenseGetArray*AndMemType() implementation: always hands out the device array and
// reports the backend's memtype.
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
{
  PetscFunctionBegin;
  PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
  if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Counterpart to GetArrayAndMemType(): restores the device array.
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
{
  PetscFunctionBegin;
  PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
  PetscFunctionReturn(PETSC_SUCCESS);
}

1229: // ==========================================================================================

// MatDenseCUPMPlaceArray(): temporarily substitute a caller-provided device array for the
// matrix data; the current pointer (and its ownership flag) is stashed until ResetArray().
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
  if (mimpl->v) {
    PetscDeviceContext dctx;

    // flush any pending host-side changes into the (soon to be stashed) device array
    PetscCall(GetHandles_(&dctx));
    PetscCall(HostToDevice_(A, dctx));
  }
  // the placed array is owned by the caller, so mark d_user_alloc TRUE while it is in use
  mcu->unplacedarray         = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
  mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
  PetscFunctionReturn(PETSC_SUCCESS);
}

// MatDenseCUPMReplaceArray(): permanently replace the device array. The old array is freed
// (unless user-owned); d_user_alloc is set FALSE afterwards, i.e. the matrix takes over
// responsibility for freeing the new array -- presumably mirroring MatDenseReplaceArray()'s
// ownership-transfer semantics (caller must not free it).
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
  if (!mcu->d_user_alloc) {
    cupmStream_t stream;

    PetscCall(GetHandles_(&stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
  }
  mcu->d_v          = const_cast<PetscScalar *>(array);
  mcu->d_user_alloc = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// MatDenseCUPMResetArray(): undo a prior PlaceArray(), restoring the stashed device pointer
// and its ownership flag.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  if (mimpl->v) {
    PetscDeviceContext dctx;

    // flush pending host-side changes into the placed array before handing it back
    PetscCall(GetHandles_(&dctx));
    PetscCall(HostToDevice_(A, dctx));
  }
  mcu->d_v          = util::exchange(mcu->unplacedarray, nullptr);
  mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
  PetscFunctionReturn(PETSC_SUCCESS);
}

1293: // ==========================================================================================

// Numeric phase of C = op(A) * op(B) for dense matrices via a single device GEMM. Host
// SEQDENSE operands are converted to the device type in place for the duration of the call
// and converted back afterwards.
template <device::cupm::DeviceType T>
template <bool transpose_A, bool transpose_B>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
{
  cupmBlasInt_t      m, n, k;
  PetscBool          Aiscupm, Biscupm;
  PetscDeviceContext dctx;
  cupmBlasHandle_t   handle;

  PetscFunctionBegin;
  // m x n are C's dimensions; k is the contracted dimension of op(A)
  PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
  PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
  PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
  if (!m || !n || !k) {
    PetscCall(ZeroEntries(C));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // we may end up with SEQDENSE as one of the arguments
  // REVIEW ME: how? and why is it not B and C????????
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm));
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
  if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
  if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B));
  PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n));
  PetscCall(GetHandles_(&dctx, &handle));

  PetscCall(PetscLogGpuTimeBegin());
  {
    const auto one  = cupmScalarCast(1.0);
    const auto zero = cupmScalarCast(0.0);
    const auto da   = DeviceArrayRead(dctx, A);
    const auto db   = DeviceArrayRead(dctx, B);
    const auto dc   = DeviceArrayWrite(dctx, C);
    PetscInt   alda, blda, clda;

    PetscCall(MatDenseGetLDA(A, &alda));
    PetscCall(MatDenseGetLDA(B, &blda));
    PetscCall(MatDenseGetLDA(C, &clda));
    PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda));
  }
  PetscCall(PetscLogGpuTimeEnd());

  // m*n*k multiplies plus m*n*(k-1) additions
  PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
  if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
  if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// B = A, device-to-device when both matrices share this implementation's copy routine,
// falling back to the generic (host) MatCopy_Basic() otherwise.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept
{
  const auto m = A->rmap->n;
  const auto n = A->cmap->n;

  PetscFunctionBegin;
  PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
  // The two matrices must have the same copy implementation to be eligible for fast copy
  if (A->ops->copy == B->ops->copy) {
    PetscDeviceContext dctx;
    cupmStream_t       stream;

    PetscCall(GetHandles_(&dctx, &stream));
    PetscCall(PetscLogGpuTimeBegin());
    {
      const auto va = DeviceArrayRead(dctx, A);
      const auto vb = DeviceArrayWrite(dctx, B);
      // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets
      // lda!
      const auto lda_a = MatIMPLCast(A)->lda;
      const auto lda_b = MatIMPLCast(B)->lda;

      if (lda_a > m || lda_b > m) {
        // at least one matrix has a padded leading dimension: copy column-by-column
        PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME());
        PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME());
        PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream));
      } else {
        // both buffers are contiguous (lda == m), copy all m*n entries in one shot
        PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream));
      }
    }
    PetscCall(PetscLogGpuTimeEnd());
  } else {
    PetscCall(MatCopy_Basic(A, B, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

1382: template <device::cupm::DeviceType T>
1383: inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept
1384: {
1385:   PetscDeviceContext dctx;
1386:   cupmStream_t       stream;

1388:   PetscFunctionBegin;
1389:   PetscCall(GetHandles_(&dctx, &stream));
1390:   PetscCall(PetscLogGpuTimeBegin());
1391:   {
1392:     const auto va  = DeviceArrayWrite(dctx, m);
1393:     const auto lda = MatIMPLCast(m)->lda;
1394:     const auto ma  = m->rmap->n;
1395:     const auto na  = m->cmap->n;

1397:     if (lda > ma) {
1398:       PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream));
1399:     } else {
1400:       PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream));
1401:     }
1402:   }
1403:   PetscCall(PetscLogGpuTimeEnd());
1404:   PetscFunctionReturn(PETSC_SUCCESS);
1405: }

1407: namespace detail
1408: {

1410: // ==========================================================================================
1411: // SubMatIndexFunctor
1412: //
1413: // Iterator which permutes a linear index range into matrix indices for an nrows x ncols
1414: // submat with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for
1415: // the i'th sequential entry in the matrix.
1416: // ==========================================================================================
1417: template <typename T>
1418: struct SubMatIndexFunctor {
1419:   PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); }

1421:   PetscInt nrows;
1422:   PetscInt ncols;
1423:   PetscInt lda;
1424: };

// Iterator range over an nrows x ncols submatrix stored with leading dimension lda; the
// index permutation itself is supplied by SubMatIndexFunctor.
template <typename Iterator>
struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<iter_difference_t<Iterator>>> {
  using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<iter_difference_t<Iterator>>>;

  using iterator = typename base_type::iterator;

  // first/last delimit the raw storage span; {nrows, ncols, lda} brace-initializes the
  // functor (so its member order must match)
  constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept :
    base_type{
      std::move(first), std::move(last), {nrows, ncols, lda}
  }
  {
  }

  // the permuted range visits exactly nrows * ncols entries, irrespective of lda padding
  PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); }
};

1442: namespace
1443: {

1445: template <typename T>
1446: PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept
1447: {
1448:   const auto nrows = rend - rstart;
1449:   const auto ncols = cend - cstart;
1450:   const auto dptr  = thrust::device_pointer_cast(ptr);

1452:   return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda};
1453: }

1455: } // namespace

// Unary functor returning the complex conjugate of its argument (identity for real scalars)
struct conjugate {
  PETSC_NODISCARD PETSC_HOSTDEVICE_INLINE_DECL PetscScalar operator()(const PetscScalar &x) const noexcept { return PetscConj(x); }
};

1461: } // namespace detail

1463: template <device::cupm::DeviceType T>
1464: inline PetscErrorCode MatDense_Seq_CUPM<T>::Conjugate(Mat A) noexcept
1465: {
1466:   const auto         m = A->rmap->n;
1467:   const auto         n = A->cmap->n;
1468:   const auto         N = m * n;
1469:   PetscDeviceContext dctx;
1470:   cupmStream_t       stream;

1472:   PetscFunctionBegin;
1473:   if (PetscDefined(USE_COMPLEX)) {
1474:     PetscCall(GetHandles_(&dctx, &stream));
1475:     PetscCall(PetscLogGpuTimeBegin());
1476:     {
1477:       const auto   da  = DeviceArrayReadWrite(dctx, A);
1478:       const auto   lda = MatIMPLCast(A)->lda;
1479:       cupmStream_t stream;
1480:       PetscCall(GetHandlesFrom_(dctx, &stream));

1482:       if (lda > m) {
1483:         // clang-format off
1484:         PetscCallThrust(
1485:           const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());

1487:           THRUST_CALL(
1488:             thrust::transform,
1489:             stream,
1490:             sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
1491:             detail::conjugate{}
1492:           )
1493:         );
1494:         // clang-format on
1495:       } else {
1496:         // clang-format off
1497:         PetscCallThrust(
1498:           const auto aptr = thrust::device_pointer_cast(da.data());

1500:           THRUST_CALL(
1501:             thrust::transform,
1502:             stream,
1503:             aptr, aptr + N, aptr,
1504:             detail::conjugate{}
1505:           )
1506:         );
1507:         // clang-format on
1508:       }
1509:     }
1510:     PetscCall(PetscLogGpuTimeEnd());
1511:   }
1512:   PetscFunctionReturn(PETSC_SUCCESS);
1513: }

// A *= alpha, entrywise on the device. Uses a strided thrust transform when lda is padded
// (BLAS scal would also touch the padding), and a single cupmBlasXscal() otherwise.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept
{
  const auto         m = A->rmap->n;
  const auto         n = A->cmap->n;
  const auto         N = m * n;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
  PetscCall(GetHandles_(&dctx));
  {
    const auto da  = DeviceArrayReadWrite(dctx, A);
    // read lda only after acquiring the device array; acquisition may preallocate and set it
    const auto lda = MatIMPLCast(A)->lda;

    if (lda > m) {
      cupmStream_t stream;

      PetscCall(GetHandlesFrom_(dctx, &stream));
      // clang-format off
      PetscCallThrust(
        const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());

        THRUST_CALL(
          thrust::transform,
          stream,
          sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
          device::cupm::functors::make_times_equals(alpha)
        )
      );
      // clang-format on
    } else {
      // contiguous storage: scale all N entries with one BLAS call
      const auto       cu_alpha = cupmScalarCast(alpha);
      cupmBlasHandle_t handle;

      PetscCall(GetHandlesFrom_(dctx, &handle));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  // one multiply per entry
  PetscCall(PetscLogGpuFlops(N));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// A = diag(l) * A * diag(r) via cupmBlasXdgmm(). Either vector may be NULL (that side is
// skipped); a vector that is not a device vector (or is bound to the CPU) is first copied
// into a temporary device vector.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::DiagonalScale(Mat A, Vec l, Vec r) noexcept
{
  PetscBool          iscupm;
  PetscDeviceContext dctx;
  cupmBlasHandle_t   handle;
  auto               m = A->rmap->n, n = A->cmap->n;

  PetscFunctionBegin;
  if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscInfo(A, "Performing DiagonalScale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
  PetscCall(GetHandles_(&dctx, &handle));
  {
    Vec        lr;
    const auto da  = DeviceArrayReadWrite(dctx, A);
    // read lda only after acquiring the device array; acquisition may preallocate and set it
    const auto lda = MatIMPLCast(A)->lda;

    if (l) {
      // note: m is reused as the left-vector length here (must equal A->rmap->n)
      PetscCall(VecGetSize(l, &m));
      PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling Vec of wrong size");
      PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(l), &iscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
      if (!iscupm || l->boundtocpu) {
        // host-only vector: make a temporary device copy for the dgmm call
        PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(l)), &lr));
        PetscCall(VecSetLayout(lr, l->map));
        PetscCall(VecSetType(lr, VecSeq_CUPM::VECCUPM()));
        PetscCall(VecCopy(l, lr));
      } else lr = l;
      {
        const auto     dlr  = VecSeq_CUPM::DeviceArrayRead(dctx, lr);
        constexpr auto side = CUPMBLAS_SIDE_LEFT;

        // A = diag(lr) * A, written back into A in place
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallCUPMBLAS(cupmBlasXdgmm(handle, side, m, n, da.cupmdata(), lda, dlr.cupmdata(), 1, da.cupmdata(), lda));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCall(PetscLogGpuFlops(1.0 * n * m));
      }
      if (!iscupm || l->boundtocpu) PetscCall(VecDestroy(&lr));
    }
    if (r) {
      // note: n is reused as the right-vector length here (must equal A->cmap->n)
      PetscCall(VecGetSize(r, &n));
      PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling Vec of wrong size");
      PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(r), &iscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
      if (!iscupm || r->boundtocpu) {
        // host-only vector: make a temporary device copy for the dgmm call
        PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(r)), &lr));
        PetscCall(VecSetLayout(lr, r->map));
        PetscCall(VecSetType(lr, VecSeq_CUPM::VECCUPM()));
        PetscCall(VecCopy(r, lr));
      } else lr = r;
      {
        const auto     dlr  = VecSeq_CUPM::DeviceArrayRead(dctx, lr);
        constexpr auto side = CUPMBLAS_SIDE_RIGHT;

        // A = A * diag(lr), written back into A in place
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallCUPMBLAS(cupmBlasXdgmm(handle, side, m, n, da.cupmdata(), lda, dlr.cupmdata(), 1, da.cupmdata(), lda));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCall(PetscLogGpuFlops(1.0 * n * m));
      }
      if (!iscupm || r->boundtocpu) PetscCall(VecDestroy(&lr));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

1623: template <device::cupm::DeviceType T>
1624: inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
1625: {
1626:   const auto         m_x = X->rmap->n, m_y = Y->rmap->n;
1627:   const auto         n_x = X->cmap->n, n_y = Y->cmap->n;
1628:   const auto         N = m_x * n_x;
1629:   PetscDeviceContext dctx;

1631:   PetscFunctionBegin;
1632:   if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
1633:   PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
1634:   PetscCall(GetHandles_(&dctx));
1635:   {
1636:     const auto dx    = DeviceArrayRead(dctx, X);
1637:     const auto dy    = DeviceArrayReadWrite(dctx, Y);
1638:     const auto lda_x = MatIMPLCast(X)->lda;
1639:     const auto lda_y = MatIMPLCast(Y)->lda;

1641:     if (lda_x > m_x || lda_y > m_x) {
1642:       cupmStream_t stream;

1644:       PetscCall(GetHandlesFrom_(dctx, &stream));
1645:       // clang-format off
1646:       PetscCallThrust(
1647:         const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data());
1648:         const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data());

1650:         THRUST_CALL(
1651:           thrust::transform,
1652:           stream,
1653:           sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(),
1654:           device::cupm::functors::make_axpy(alpha)
1655:         );
1656:       );
1657:       // clang-format on
1658:     } else {
1659:       const auto       cu_alpha = cupmScalarCast(alpha);
1660:       cupmBlasHandle_t handle;

1662:       PetscCall(GetHandlesFrom_(dctx, &handle));
1663:       PetscCall(PetscLogGpuTimeBegin());
1664:       PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1));
1665:       PetscCall(PetscLogGpuTimeEnd());
1666:     }
1667:   }
1668:   PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
1669:   PetscFunctionReturn(PETSC_SUCCESS);
1670: }

// Duplicate A into a new device dense matrix *B. If the values live on the device, the
// host-side duplicate copies no values (hopt demoted to MAT_DO_NOT_COPY_VALUES) and the
// data is instead copied device-to-device via Copy().
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
{
  const auto         hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(GetHandles_(&dctx));
  // do not call SetPreallocation() yet, we call it afterwards??
  PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
  PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
  // values requested but skipped on the host above: copy them on the device instead
  if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
  // allocate memory if needed
  if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx, nullptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Fill A with random values. Generates directly into the device array when rng is the
// device random type; otherwise falls back to the host implementation.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
{
  PetscBool device_rand_is_rander48;
  PetscBool device = PETSC_FALSE;

  PetscFunctionBegin;
  // CUPMObject::PETSCDEVICERAD() is PETSCRANDER48 until PetscRandom is implemented for hiprand
  PetscCall(PetscStrncmp(PETSCDEVICERAND(), PETSCRANDER48, sizeof(PETSCRANDER48), &device_rand_is_rander48));
  if (!device_rand_is_rander48) PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
  if (device) {
    const auto         m = A->rmap->n;
    const auto         n = A->cmap->n;
    PetscDeviceContext dctx;

    PetscCall(GetHandles_(&dctx));
    {
      const auto a = DeviceArrayWrite(dctx, A);
      PetscInt   lda;

      PetscCall(MatDenseGetLDA(A, &lda));
      if (lda > m) {
        // padded leading dimension: fill one column at a time, skipping the padding
        for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
      } else {
        // contiguous storage: fill all m * n entries with a single call
        PetscInt mn;

        // overflow-checked m * n
        PetscCall(PetscIntMultError(m, n, &mn));
        PetscCall(PetscRandomGetValues(rng, mn, a));
      }
    }
  } else {
    PetscCall(MatSetRandom_SeqDense(A, rng));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

1725: // ==========================================================================================

// Copy column `col` of A into v, choosing the memcpy direction based on where A's data
// currently lives (offloadmask) and whether v is a device vector.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
{
  const auto         offloadmask = A->offloadmask;
  const auto         n           = A->rmap->n;
  // start of column `col` in a column-major buffer of leading dimension lda
  const auto         col_offset  = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
  PetscBool          viscupm;
  PetscDeviceContext dctx;
  cupmStream_t       stream;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
  PetscCall(GetHandles_(&dctx, &stream));
  if (viscupm && !v->boundtocpu) {
    const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);

    // update device data
    if (PetscOffloadDevice(offloadmask)) {
      PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
    } else {
      PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
    }
  } else {
    PetscScalar *x;

    // update host data
    PetscCall(VecGetArrayWrite(v, &x));
    if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
      PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
    } else if (PetscOffloadDevice(offloadmask)) {
      PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
    }
    PetscCall(VecRestoreArrayWrite(v, &x));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Expose column `col` of A as the internal column vector mimpl->cvec (no copy: the vector
// aliases A's device storage). Must be paired with RestoreColumnVec().
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
{
  using namespace vec::cupm;
  const auto         mimpl = MatIMPLCast(A);
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  // stored as col + 1 so that column 0 still reads as "in use" (nonzero)
  mimpl->vecinuse = col + 1;
  if (!mimpl->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &mimpl->cvec));
  PetscCall(GetHandles_(&dctx));
  // acquire the device array with the requested access mode; pointer stashed for Restore
  PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
  // point the column vector at the start of column `col` (column-major, stride lda)
  PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
  if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
  *v = mimpl->cvec;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Release the column vector view handed out by GetColumnVec(): unlock/unplace the vector
// and return the stashed device pointer with the matching access mode.
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
{
  using namespace vec::cupm;
  const auto         mimpl = MatIMPLCast(A);
  const auto         cvec  = mimpl->cvec;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
  PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
  mimpl->vecinuse = 0;
  if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
  PetscCall(VecCUPMResetArrayAsync<T>(cvec));
  PetscCall(GetHandles_(&dctx));
  // must mirror the GetArray() call made in GetColumnVec() (same memtype and access)
  PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
  if (v) *v = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}

1806: // ==========================================================================================

// Create an (unallocated) device dense factor matrix for A and wire up the symbolic
// factorization entry points for the requested factor type.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
{
  Mat                fact = nullptr;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(GetHandles_(&dctx));
  // defer allocation until the factorization actually needs the buffer
  PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
  fact->factortype = ftype;
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU: // fall-through
    fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqDense;
    fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC: // fall-through
    fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
    break;
  case MAT_FACTOR_QR: {
    // QR has no ops slot, it is dispatched via composed functions instead
    const auto pobj = PetscObjectCast(fact);

    PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
    PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
  } break;
  case MAT_FACTOR_NONE:
  case MAT_FACTOR_ILUDT:     // fall-through
  case MAT_FACTOR_NUM_TYPES: // fall-through
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
  }
  PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
  // dense factorizations use no fill-reducing ordering
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
  *fact_out = fact;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Compute the explicit inverse of A from its (Cholesky) factorization in place via
// cupmSolverXpotri(), then clear the factorization state so A behaves as a plain matrix.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
{
  const auto         mimpl = MatIMPLCast(A);
  const auto         mcu   = MatCUPMCast(A);
  const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
  cupmSolverHandle_t handle;
  PetscDeviceContext dctx;
  cupmStream_t       stream;

  PetscFunctionBegin;
#if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
  // HIP appears to have this by default??
  PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
#endif
  if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
  // spd
  // pivots present would mean a symmetric-indefinite (sytrf) factorization, whose
  // inversion (sytri) is not available here
  PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());

  PetscCall(GetHandles_(&dctx, &handle, &stream));
  {
    const auto    da  = DeviceArrayReadWrite(dctx, A);
    const auto    lda = static_cast<cupmBlasInt_t>(mimpl->lda);
    cupmBlasInt_t il;

    // grow the cached device workspace only when the required size exceeds it
    PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
    if (il > mcu->d_fact_lwork) {
      mcu->d_fact_lwork = il;
      PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
      PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
    }
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
    PetscCall(PetscLogGpuTimeEnd());
  }
  PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
  // TODO (write cuda kernel)
  // potri only writes the lower triangle; mirror it to the upper triangle
  PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
  PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));

  // A now holds the explicit inverse, not a factorization: drop the solver callbacks
  A->ops->solve          = nullptr;
  A->ops->solvetranspose = nullptr;
  A->ops->matsolve       = nullptr;
  A->factortype          = MAT_FACTOR_NONE;

  PetscCall(PetscFree(A->solvertype));
  PetscFunctionReturn(PETSC_SUCCESS);
}

1898: // ==========================================================================================

// Expose the [rbegin, rend) x [cbegin, cend) block of A as the internal submatrix
// mimpl->cmat, aliasing A's storage (no copy). Must be paired with RestoreSubMatrix().
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept
{
  const auto         mimpl        = MatIMPLCast(A);
  // column-major offset of entry (rbegin, cbegin)
  const auto         array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
  // NOTE(review): n is the ROW count and m the COLUMN count here — reversed from the
  // usual convention, but used consistently below
  const auto         n            = rend - rbegin;
  const auto         m            = cend - cbegin;
  auto              &cmat         = mimpl->cmat;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  // stored as cbegin + 1 so that cbegin == 0 still reads as "in use" (nonzero)
  mimpl->matinuse = cbegin + 1;

  PetscCall(GetHandles_(&dctx));
  PetscCall(HostToDevice_(A, dctx));

  // the cached submatrix can only be reused if its dimensions match
  if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
  {
    const auto device_array = array_offset(MatCUPMCast(A)->d_v);

    if (cmat) {
      PetscCall(PlaceArray(cmat, device_array));
    } else {
      PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
    }
  }
  // the view shares A's leading dimension, not its own row count
  PetscCall(MatDenseSetLDA(cmat, mimpl->lda));
  // place CPU array if present but do not copy any data
  if (const auto host_array = mimpl->v) {
    // temporarily mark the data as on the GPU so placing the host array copies nothing
    cmat->offloadmask = PETSC_OFFLOAD_GPU;
    PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
  }

  // the view inherits A's notion of where the valid data lives
  cmat->offloadmask = A->offloadmask;
  *mat              = cmat;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Release the submatrix view handed out by GetSubMatrix(), propagating any host-side
// modifications made through the view back into A's offload state.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto cmat  = mimpl->cmat;
  // a host array was placed on the view iff A itself has a host array
  const auto reset = static_cast<bool>(mimpl->v);
  bool       copy, was_offload_host;

  PetscFunctionBegin;
  PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
  PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
  PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
  mimpl->matinuse = 0;

  // calls to ResetArray may change it, so save it here
  was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
  if (was_offload_host && !reset) {
    // the view was modified on the host but A has no host array: allocate one so the
    // device data can be synchronized down below
    copy = true;
    PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
  } else {
    copy = false;
  }

  // detach the device array (and, if placed, the host array) from the view
  PetscCall(ResetArray(cmat));
  if (reset) PetscCall(MatDenseResetArray(cmat));
  if (copy) {
    PetscDeviceContext dctx;

    PetscCall(GetHandles_(&dctx));
    PetscCall(DeviceToHost_(A, dctx));
  } else {
    A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
  }

  cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  *m                = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}

1979: // ==========================================================================================

1981: namespace
1982: {

1984: template <device::cupm::DeviceType T>
1985: inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
1986: {
1987:   PetscFunctionBegin;
1988:   if (TA) {
1989:     if (TB) {
1990:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
1991:     } else {
1992:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
1993:     }
1994:   } else {
1995:     if (TB) {
1996:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
1997:     } else {
1998:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
1999:     }
2000:   }
2001:   PetscFunctionReturn(PETSC_SUCCESS);
2002: }

2004: template <device::cupm::DeviceType T>
2005: inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
2006: {
2007:   PetscFunctionBegin;
2008:   for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
2009:     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
2010:     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
2011:   }
2012:   PetscFunctionReturn(PETSC_SUCCESS);
2013: }

2015: } // anonymous namespace

2017: } // namespace impl

2019: } // namespace cupm

2021: } // namespace mat

2023: } // namespace Petsc