Actual source code: cupminterface.hpp
1: #ifndef PETSCCUPMINTERFACE_HPP
2: #define PETSCCUPMINTERFACE_HPP
4: #if defined(__cplusplus)
5: #include <petsc/private/cpputil.hpp>
6: #include <petsc/private/petscadvancedmacros.h>
7: #include <petscdevice_cupm.h>
9: #include <array>
11: namespace Petsc
12: {
14: namespace device
15: {
17: namespace cupm
18: {
// Enumerates the available cupm backends. It is used as the template parameter of any class
// that subclasses the Interface or keeps it as a member variable. The enumerator values are
// spelled out because InterfaceBase::cupmName() uses them to index DeviceTypes.
enum class DeviceType : int {
  CUDA = 0,
  HIP  = 1
};
// String table for DeviceType. The first two entries are the backend names, stored at the
// index equal to the enumerator value; they are followed by the enum type name, the
// enumerator prefix and a terminating nullptr.
static constexpr std::array<const char *const, 5> DeviceTypes = {
  "cuda",                              // DeviceType::CUDA
  "hip",                               // DeviceType::HIP
  "Petsc::Device::CUPM::DeviceType",   // type name
  "Petsc::Device::CUPM::DeviceType::", // enumerator prefix
  nullptr                              // terminator
};
29: namespace impl
30: {
// A backend agnostic PetscCallCUPM() function, this will only work inside the member
// functions of a class inheriting from CUPM::Interface (it relies on cupmError_t,
// cupmSuccess, cupmName(), cupmGetErrorName() and cupmGetErrorString() being in scope).
// Thanks to __VA_ARGS__ templated functions can also be wrapped inline:
//
// PetscCallCUPM(foo<int,char,bool>());
//
// The wrapped expression is evaluated exactly once. If it does not yield cupmSuccess a PETSc
// error of type PETSC_ERR_GPU is raised on PETSC_COMM_SELF, i.e. the enclosing function
// returns a nonzero PetscErrorCode.
#define PetscCallCUPM(...) \
  do { \
    const cupmError_t cerr_p_ = __VA_ARGS__; \
    PetscCheck(cerr_p_ == cupmSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "%s error %d (%s) : %s", cupmName(), static_cast<int>(cerr_p_), cupmGetErrorName(cerr_p_), cupmGetErrorString(cerr_p_)); \
  } while (0)
// Same as PetscCallCUPM() but for use in functions that cannot return a PetscErrorCode: if
// the wrapped expression does not yield cupmSuccess the error is reported through
// PetscCheckAbort() on the communicator comm_ instead of being returned. Like
// PetscCallCUPM() this only works inside members of a class inheriting from
// CUPM::Interface.
#define PetscCallCUPMAbort(comm_, ...) \
  do { \
    const cupmError_t cerr_p_ = __VA_ARGS__; \
    PetscCheckAbort(cerr_p_ == cupmSuccess, comm_, PETSC_ERR_GPU, "%s error %d (%s) : %s", cupmName(), static_cast<int>(cerr_p_), cupmGetErrorName(cerr_p_), cupmGetErrorString(cerr_p_)); \
  } while (0)
// PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT() - declares a static constant that aliases a
// cuda/hip integral constant; both names are assembled from a prefix and a suffix
//
// input params:
// our_prefix   - prefix of the alias being declared
// our_suffix   - suffix of the alias being declared
// their_prefix - prefix of the constant being aliased
// their_suffix - suffix of the constant being aliased
//
// example usage:
// PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(cupm,Success,cuda,AllGood); ->
// static const auto cupmSuccess = cudaAllGood;
//
// PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(cupm,Success,hip,AllRight); ->
// static const auto cupmSuccess = hipAllRight;
#define PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(our_prefix, our_suffix, their_prefix, their_suffix) \
  static const auto PetscConcat(our_prefix, our_suffix) = PetscConcat(their_prefix, their_suffix)
// PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON() - declares a cupm-prefixed alias of a cuda/hip
// integral constant whose suffix differs from the alias' suffix
//
// input params:
// our_suffix   - suffix of the alias (the prefix is always cupm)
// their_suffix - suffix of the constant being aliased
//
// notes:
// PETSC_CUPM_PREFIX_L must be defined to the backend prefix (cuda or hip) at the point of
// use; it supplies the prefix of the aliased constant
//
// example usage:
// #define PETSC_CUPM_PREFIX_L cuda
// PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(Success,AllGood); ->
// static const auto cupmSuccess = cudaAllGood;
//
// #define PETSC_CUPM_PREFIX_L hip
// PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(Success,AllRight); ->
// static const auto cupmSuccess = hipAllRight;
#define PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(our_suffix, their_suffix) \
  PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(cupm, our_suffix, PETSC_CUPM_PREFIX_L, their_suffix)
// PETSC_CUPM_ALIAS_INTEGRAL_VALUE() - declares a cupm-prefixed alias of a cuda/hip integral
// constant that carries the same suffix
//
// input param:
// suffix - the suffix shared by the cuda, hip and cupm names
//
// notes:
// PETSC_CUPM_PREFIX_L must be defined to the backend prefix (cuda or hip) at the point of
// use
//
// example usage:
// #define PETSC_CUPM_PREFIX_L cuda
// PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success); -> static const auto cupmSuccess = cudaSuccess;
//
// #define PETSC_CUPM_PREFIX_L hip
// PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success); -> static const auto cupmSuccess = hipSuccess;
#define PETSC_CUPM_ALIAS_INTEGRAL_VALUE(suffix) \
  PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(suffix, suffix)
// PETSC_CUPM_ALIAS_FUNCTION_EXACT() - declares a static function that aliases a cuda/hip
// function; both names are assembled from a prefix and a suffix
//
// input params:
// our_prefix   - prefix of the alias being declared
// our_suffix   - suffix of the alias being declared
// their_prefix - prefix of the function being aliased
// their_suffix - suffix of the function being aliased
//
// notes:
// the actual expansion is delegated to PETSC_ALIAS_FUNCTION(), see there for its exact
// nature
//
// example usage:
// PETSC_CUPM_ALIAS_FUNCTION_EXACT(cupm,Malloc,cuda,Malloc) ->
// template <typename... T>
// static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
// {
//   return cudaMalloc(std::forward<T>(args)...);
// }
#define PETSC_CUPM_ALIAS_FUNCTION_EXACT(our_prefix, our_suffix, their_prefix, their_suffix) \
  PETSC_ALIAS_FUNCTION(static PetscConcat(our_prefix, our_suffix), PetscConcat(their_prefix, their_suffix))
// PETSC_CUPM_ALIAS_FUNCTION_COMMON() - declares a cupm-prefixed alias of a cuda/hip function
// whose suffix differs from the alias' suffix
//
// input params:
// our_suffix   - suffix of the alias (the prefix is always cupm)
// their_suffix - suffix common to the cuda and hip function being aliased
//
// notes:
// PETSC_CUPM_PREFIX_L must be defined to the prefix of the function being aliased (cuda or
// hip) at the point of use. See PETSC_ALIAS_FUNCTION() for the exact nature of the
// expansion
//
// example usage:
// #define PETSC_CUPM_PREFIX_L cuda
// PETSC_CUPM_ALIAS_FUNCTION_COMMON(MallocFancy,Malloc) ->
// template <typename... T>
// static constexpr auto cupmMallocFancy(T&&... args) *noexcept and trailing return type deduction*
// {
//   return cudaMalloc(std::forward<T>(args)...);
// }
//
// #define PETSC_CUPM_PREFIX_L hip
// PETSC_CUPM_ALIAS_FUNCTION_COMMON(MallocFancy,Malloc) ->
// template <typename... T>
// static constexpr auto cupmMallocFancy(T&&... args) *noexcept and trailing return type deduction*
// {
//   return hipMalloc(std::forward<T>(args)...);
// }
#define PETSC_CUPM_ALIAS_FUNCTION_COMMON(our_suffix, their_suffix) \
  PETSC_CUPM_ALIAS_FUNCTION_EXACT(cupm, our_suffix, PETSC_CUPM_PREFIX_L, their_suffix)
// PETSC_CUPM_ALIAS_FUNCTION() - declares a cupm-prefixed alias of a cuda/hip function that
// carries the same suffix
//
// input param:
// suffix - the suffix shared by the hip function, the cuda function and the alias
//
// notes:
// PETSC_CUPM_PREFIX_L must be defined to the prefix of the function being aliased (cuda or
// hip) at the point of use. See PETSC_ALIAS_FUNCTION() for the exact nature of the
// expansion
//
// example usage:
// #define PETSC_CUPM_PREFIX_L cuda
// PETSC_CUPM_ALIAS_FUNCTION(Malloc) ->
// template <typename... T>
// static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
// {
//   return cudaMalloc(std::forward<T>(args)...);
// }
//
// #define PETSC_CUPM_PREFIX_L hip
// PETSC_CUPM_ALIAS_FUNCTION(Malloc) ->
// template <typename... T>
// static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
// {
//   return hipMalloc(std::forward<T>(args)...);
// }
#define PETSC_CUPM_ALIAS_FUNCTION(suffix) \
  PETSC_CUPM_ALIAS_FUNCTION_COMMON(suffix, suffix)
// PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT() - declares a static function that aliases a
// cuda/hip function but throws away the last N arguments it is called with
//
// input params:
// our_prefix   - prefix of the alias being declared
// our_suffix   - suffix of the alias being declared
// their_prefix - prefix of the function being aliased
// their_suffix - suffix of the function being aliased
// N            - integer constant [0,INT_MAX) dictating how many arguments to chop off the end
//
// notes:
// the actual expansion is delegated to PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS(), see
// there for its exact nature
//
// example usage:
// PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT(cupm,MallocAsync,cuda,Malloc,1) ->
// template <typename... T, typename Tend>
// static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing
// return type deduction*
// {
//   (void)argend;
//   return cudaMalloc(std::forward<T>(args)...);
// }
#define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT(our_prefix, our_suffix, their_prefix, their_suffix, N) \
  PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS(static PetscConcat(our_prefix, our_suffix), PetscConcat(their_prefix, their_suffix), N)
// PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON() - declares a cupm-prefixed alias of a cuda/hip
// function but throws away the last N arguments it is called with
//
// input params:
// our_suffix   - suffix of the alias (the prefix is always cupm)
// their_suffix - suffix of the function being aliased
// N            - integer constant [0,INT_MAX) dictating how many arguments to chop off the end
//
// notes:
// PETSC_CUPM_PREFIX_L must be defined to the prefix of the function being aliased (cuda or
// hip) at the point of use. See PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS() for the exact
// nature of the expansion
//
// example usage:
// #define PETSC_CUPM_PREFIX_L cuda
// PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MallocAsync,Malloc,1) ->
// template <typename... T, typename Tend>
// static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing
// return type deduction*
// {
//   (void)argend;
//   return cudaMalloc(std::forward<T>(args)...);
// }
#define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(our_suffix, their_suffix, N) \
  PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT(cupm, our_suffix, PETSC_CUPM_PREFIX_L, their_suffix, N)
226: // Base class that holds functions and variables that don't require CUDA or HIP to be present
227: // on the system
228: template <DeviceType T>
229: struct InterfaceBase {
230: static const DeviceType type = T;
232: PETSC_CXX_COMPAT_DECL(constexpr const char *cupmName())
233: {
234: static_assert(util::integral_value(DeviceType::CUDA) == 0, "");
235: static_assert(util::integral_value(DeviceType::HIP) == 1, "");
236: return std::get<util::integral_value(T)>(DeviceTypes);
237: }
239: PETSC_CXX_COMPAT_DECL(constexpr auto PETSC_DEVICE_CUPM())
240: PETSC_DECLTYPE_AUTO_RETURNS(T == DeviceType::CUDA ? PETSC_DEVICE_CUDA : PETSC_DEVICE_HIP)
242: PETSC_CXX_COMPAT_DECL(constexpr auto PETSC_MEMTYPE_CUPM())
243: PETSC_DECLTYPE_AUTO_RETURNS(T == DeviceType::CUDA ? PETSC_MEMTYPE_CUDA : PETSC_MEMTYPE_HIP)
244: };
246: // declare the base class static member variables
247: template <DeviceType T>
248: const DeviceType InterfaceBase<T>::type;
// Names InterfaceBase<DEVICE_TYPE> as base_name inside the enclosing class and imports all
// of its members. The expansion deliberately has no trailing semicolon, the user supplies
// it.
#define PETSC_CUPM_BASE_CLASS_HEADER(base_name, DEVICE_TYPE) \
  using base_name = ::Petsc::device::cupm::impl::InterfaceBase<DEVICE_TYPE>; \
  /* variables */ \
  using base_name::type; \
  /* functions */ \
  using base_name::PETSC_DEVICE_CUPM; \
  using base_name::PETSC_MEMTYPE_CUPM; \
  using base_name::cupmName
257: // A templated C++ struct that defines the entire CUPM interface. Use of templating vs
258: // preprocessor macros allows us to use both interfaces simultaneously as well as easily
259: // import them into classes.
260: template <DeviceType>
261: struct InterfaceImpl;
263: #if PetscDefined(HAVE_CUDA)
264: #define PETSC_CUPM_PREFIX_L cuda
265: #define PETSC_CUPM_PREFIX_U CUDA
266: template <>
267: struct InterfaceImpl<DeviceType::CUDA> : InterfaceBase<DeviceType::CUDA> {
268: PETSC_CUPM_BASE_CLASS_HEADER(base_type, DeviceType::CUDA);
270: // typedefs
271: using cupmError_t = cudaError_t;
272: using cupmEvent_t = cudaEvent_t;
273: using cupmStream_t = cudaStream_t;
274: using cupmDeviceProp_t = cudaDeviceProp;
275: using cupmMemcpyKind_t = cudaMemcpyKind;
276: using cupmComplex_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), cuComplex, cuDoubleComplex>;
277: using cupmPointerAttributes_t = struct cudaPointerAttributes;
278: using cupmMemoryType_t = enum cudaMemoryType;
279: using cupmDim3 = dim3;
280: using cupmHostFn_t = cudaHostFn_t;
281: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
282: using cupmMemPool_t = cudaMemPool_t;
283: using cupmMemPoolAttr = cudaMemPoolAttr;
284: #else
285: using cupmMemPool_t = void *;
286: using cupmMemPoolAttr = unsigned int;
287: #endif
289: // values
290: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success);
291: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNotReady);
292: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorDeviceAlreadyInUse);
293: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorSetOnActiveProcess);
294: #if PETSC_PKG_CUDA_VERSION_GE(11, 1, 0)
295: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorStubLibrary);
296: #else
297: PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(ErrorStubLibrary, ErrorInsufficientDriver);
298: #endif
299: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNoDevice);
300: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamDefault);
301: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamNonBlocking);
302: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(DeviceMapHost);
303: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToDevice);
304: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToHost);
305: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToDevice);
306: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToHost);
307: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDefault);
308: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeHost);
309: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeDevice);
310: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeManaged);
311: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(EventDisableTiming);
312: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(HostAllocDefault);
313: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(HostAllocWriteCombined);
314: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
315: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemPoolAttrReleaseThreshold);
316: #else
317: static const cupmMemPoolAttr cupmMemPoolAttrReleaseThreshold = 0;
318: #endif
320: // error functions
321: PETSC_CUPM_ALIAS_FUNCTION(GetErrorName)
322: PETSC_CUPM_ALIAS_FUNCTION(GetErrorString)
323: PETSC_CUPM_ALIAS_FUNCTION(GetLastError)
325: // device management
326: PETSC_CUPM_ALIAS_FUNCTION(GetDeviceCount)
327: PETSC_CUPM_ALIAS_FUNCTION(GetDeviceProperties)
328: PETSC_CUPM_ALIAS_FUNCTION(GetDevice)
329: PETSC_CUPM_ALIAS_FUNCTION(SetDevice)
330: PETSC_CUPM_ALIAS_FUNCTION(GetDeviceFlags)
331: PETSC_CUPM_ALIAS_FUNCTION(SetDeviceFlags)
332: PETSC_CUPM_ALIAS_FUNCTION(PointerGetAttributes)
333: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
334: PETSC_CUPM_ALIAS_FUNCTION(DeviceGetMemPool)
335: PETSC_CUPM_ALIAS_FUNCTION(MemPoolSetAttribute)
336: #else
337: PETSC_CXX_COMPAT_DECL(cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int))
338: {
339: *pool = nullptr;
340: return cupmSuccess;
341: }
343: PETSC_CXX_COMPAT_DECL(cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *)) { return cupmSuccess; }
344: #endif
345: // CUDA has no cudaInit() to match hipInit()
346: PETSC_CXX_COMPAT_DECL(cupmError_t cupmInit(unsigned int)) { return cudaFree(nullptr); }
348: // stream management
349: PETSC_CUPM_ALIAS_FUNCTION(EventCreate)
350: PETSC_CUPM_ALIAS_FUNCTION(EventCreateWithFlags)
351: PETSC_CUPM_ALIAS_FUNCTION(EventDestroy)
352: PETSC_CUPM_ALIAS_FUNCTION(EventRecord)
353: PETSC_CUPM_ALIAS_FUNCTION(EventSynchronize)
354: PETSC_CUPM_ALIAS_FUNCTION(EventElapsedTime)
355: PETSC_CUPM_ALIAS_FUNCTION(EventQuery)
356: PETSC_CUPM_ALIAS_FUNCTION(StreamCreate)
357: PETSC_CUPM_ALIAS_FUNCTION(StreamCreateWithFlags)
358: PETSC_CUPM_ALIAS_FUNCTION(StreamGetFlags)
359: PETSC_CUPM_ALIAS_FUNCTION(StreamDestroy)
360: PETSC_CUPM_ALIAS_FUNCTION(StreamWaitEvent)
361: PETSC_CUPM_ALIAS_FUNCTION(StreamQuery)
362: PETSC_CUPM_ALIAS_FUNCTION(StreamSynchronize)
363: PETSC_CUPM_ALIAS_FUNCTION(DeviceSynchronize)
364: PETSC_CUPM_ALIAS_FUNCTION(GetSymbolAddress)
366: // memory management
367: PETSC_CUPM_ALIAS_FUNCTION(Free)
368: PETSC_CUPM_ALIAS_FUNCTION(Malloc)
369: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
370: PETSC_CUPM_ALIAS_FUNCTION(FreeAsync)
371: PETSC_CUPM_ALIAS_FUNCTION(MallocAsync)
372: #else
373: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(FreeAsync, Free, 1)
374: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MallocAsync, Malloc, 1)
375: #endif
376: PETSC_CUPM_ALIAS_FUNCTION(Memcpy)
377: PETSC_CUPM_ALIAS_FUNCTION(MemcpyAsync)
378: PETSC_CUPM_ALIAS_FUNCTION(MallocHost)
379: PETSC_CUPM_ALIAS_FUNCTION(FreeHost)
380: PETSC_CUPM_ALIAS_FUNCTION(Memset)
381: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
382: PETSC_CUPM_ALIAS_FUNCTION(MemsetAsync)
383: #else
384: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MemsetAsync, Memset, 1)
385: #endif
387: // launch control
388: PETSC_CUPM_ALIAS_FUNCTION(LaunchHostFunc)
389: template <typename FunctionT, typename... KernelArgsT>
390: PETSC_CXX_COMPAT_DECL(cudaError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, cudaStream_t stream, KernelArgsT &&...kernelArgs))
391: {
392: void *args[] = {(void *)&kernelArgs...};
393: return cudaLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
394: }
395: };
396: #undef PETSC_CUPM_PREFIX_L
397: #undef PETSC_CUPM_PREFIX_U
398: #endif // PetscDefined(HAVE_CUDA)
400: #if PetscDefined(HAVE_HIP)
401: #define PETSC_CUPM_PREFIX_L hip
402: #define PETSC_CUPM_PREFIX_U HIP
403: template <>
404: struct InterfaceImpl<DeviceType::HIP> : InterfaceBase<DeviceType::HIP> {
405: PETSC_CUPM_BASE_CLASS_HEADER(base_type, DeviceType::HIP);
407: // typedefs
408: using cupmError_t = hipError_t;
409: using cupmEvent_t = hipEvent_t;
410: using cupmStream_t = hipStream_t;
411: using cupmDeviceProp_t = hipDeviceProp_t;
412: using cupmMemcpyKind_t = hipMemcpyKind;
413: using cupmComplex_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), hipComplex, hipDoubleComplex>;
414: using cupmPointerAttributes_t = hipPointerAttribute_t;
415: using cupmMemoryType_t = enum hipMemoryType;
416: using cupmDim3 = dim3;
417: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
418: using cupmHostFn_t = hipHostFn_t;
419: using cupmMemPool_t = hipMemPool_t;
420: using cupmMemPoolAttr = hipMemPoolAttr;
421: #else
422: using cupmHostFn_t = void (*)(void *);
423: using cupmMemPool_t = void *;
424: using cupmMemPoolAttr = unsigned int;
425: #endif
427: // values
428: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success);
429: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNotReady);
430: // see https://github.com/ROCm-Developer-Tools/HIP/blob/develop/bin/hipify-perl
431: PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(ErrorDeviceAlreadyInUse, ErrorContextAlreadyInUse);
432: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorSetOnActiveProcess);
433: // as of HIP v4.2 cudaErrorStubLibrary has no HIP equivalent
434: PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(ErrorStubLibrary, ErrorInsufficientDriver);
435: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNoDevice);
436: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamDefault);
437: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamNonBlocking);
438: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(DeviceMapHost);
439: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToDevice);
440: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToHost);
441: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToDevice);
442: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToHost);
443: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDefault);
444: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeHost);
445: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeDevice);
446: // see
447: // https://github.com/ROCm-Developer-Tools/HIP/blob/develop/include/hip/hip_runtime_api.h#L156
448: PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(MemoryTypeManaged, MemoryTypeUnified);
449: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(EventDisableTiming);
450: PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(HostAllocDefault, HostMallocDefault);
451: PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(HostAllocWriteCombined, HostMallocWriteCombined);
452: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
453: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemPoolAttrReleaseThreshold);
454: #else
455: static const cupmMemPoolAttr cupmMemPoolAttrReleaseThreshold = 0;
456: #endif
458: // error functions
459: PETSC_CUPM_ALIAS_FUNCTION(GetErrorName)
460: PETSC_CUPM_ALIAS_FUNCTION(GetErrorString)
461: PETSC_CUPM_ALIAS_FUNCTION(GetLastError)
463: // device management
464: PETSC_CUPM_ALIAS_FUNCTION(GetDeviceCount)
465: PETSC_CUPM_ALIAS_FUNCTION(GetDeviceProperties)
466: PETSC_CUPM_ALIAS_FUNCTION(GetDevice)
467: PETSC_CUPM_ALIAS_FUNCTION(SetDevice)
468: PETSC_CUPM_ALIAS_FUNCTION(GetDeviceFlags)
469: PETSC_CUPM_ALIAS_FUNCTION(SetDeviceFlags)
470: PETSC_CUPM_ALIAS_FUNCTION(PointerGetAttributes)
471: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
472: PETSC_CUPM_ALIAS_FUNCTION(DeviceGetMemPool)
473: PETSC_CUPM_ALIAS_FUNCTION(MemPoolSetAttribute)
474: #else
475: PETSC_CXX_COMPAT_DECL(cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int))
476: {
477: *pool = nullptr;
478: return cupmSuccess;
479: }
481: PETSC_CXX_COMPAT_DECL(cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *)) { return cupmSuccess; }
482: #endif
483: PETSC_CUPM_ALIAS_FUNCTION(Init)
485: // stream management
486: PETSC_CUPM_ALIAS_FUNCTION(EventCreate)
487: PETSC_CUPM_ALIAS_FUNCTION(EventCreateWithFlags)
488: PETSC_CUPM_ALIAS_FUNCTION(EventDestroy)
489: PETSC_CUPM_ALIAS_FUNCTION(EventRecord)
490: PETSC_CUPM_ALIAS_FUNCTION(EventSynchronize)
491: PETSC_CUPM_ALIAS_FUNCTION(EventElapsedTime)
492: PETSC_CUPM_ALIAS_FUNCTION(EventQuery)
493: PETSC_CUPM_ALIAS_FUNCTION(StreamCreate)
494: PETSC_CUPM_ALIAS_FUNCTION(StreamCreateWithFlags)
495: PETSC_CUPM_ALIAS_FUNCTION(StreamGetFlags)
496: PETSC_CUPM_ALIAS_FUNCTION(StreamDestroy)
497: PETSC_CUPM_ALIAS_FUNCTION(StreamWaitEvent)
498: PETSC_CUPM_ALIAS_FUNCTION(StreamQuery)
499: PETSC_CUPM_ALIAS_FUNCTION(StreamSynchronize)
500: PETSC_CUPM_ALIAS_FUNCTION(DeviceSynchronize)
501: PETSC_CUPM_ALIAS_FUNCTION(GetSymbolAddress)
503: // memory management
504: PETSC_CUPM_ALIAS_FUNCTION(Free)
505: PETSC_CUPM_ALIAS_FUNCTION(Malloc)
506: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
507: PETSC_CUPM_ALIAS_FUNCTION(MallocAsync);
508: PETSC_CUPM_ALIAS_FUNCTION(FreeAsync);
509: #else
510: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MallocAsync, Malloc, 1)
511: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(FreeAsync, Free, 1)
512: #endif
513: PETSC_CUPM_ALIAS_FUNCTION(Memcpy)
514: PETSC_CUPM_ALIAS_FUNCTION(MemcpyAsync)
515: // hipMallocHost is deprecated
516: PETSC_CUPM_ALIAS_FUNCTION_COMMON(MallocHost, HostMalloc)
517: // hipFreeHost is deprecated
518: PETSC_CUPM_ALIAS_FUNCTION_COMMON(FreeHost, HostFree)
519: PETSC_CUPM_ALIAS_FUNCTION(Memset)
520: PETSC_CUPM_ALIAS_FUNCTION(MemsetAsync)
522: // launch control
523: // HIP appears to only have hipLaunchHostFunc from 5.2.0 onwards
524: // https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md#7-execution-control=
525: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
526: PETSC_CUPM_ALIAS_FUNCTION(LaunchHostFunc);
527: #else
528: PETSC_CXX_COMPAT_DECL(hipError_t cupmLaunchHostFunc(hipStream_t stream, cupmHostFn_t fn, void *ctx))
529: {
530: // the only correct way to spoof this function is to do it synchronously...
531: auto herr = hipStreamSynchronize(stream);
532: if (PetscUnlikely(herr != hipSuccess)) return herr;
533: fn(ctx);
534: return herr;
535: }
536: #endif
538: template <typename FunctionT, typename... KernelArgsT>
539: PETSC_CXX_COMPAT_DECL(hipError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, hipStream_t stream, KernelArgsT &&...kernelArgs))
540: {
541: void *args[] = {(void *)&kernelArgs...};
542: return hipLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
543: }
544: };
545: #undef PETSC_CUPM_PREFIX_L
546: #undef PETSC_CUPM_PREFIX_U
547: #endif // PetscDefined(HAVE_HIP)
// Shorthand that pulls the typedefs, constants and functions of InterfaceImpl<T> (named
// base_name) as well as those of InterfaceBase<T> (named base_<base_name>) into the
// enclosing class; it's annoying that c++ doesn't have a way to do this automatically. The
// expansion has no trailing semicolon, the user supplies it.
#define PETSC_CUPM_IMPL_CLASS_HEADER(base_name, T) \
  PETSC_CUPM_BASE_CLASS_HEADER(PetscConcat(base_, base_name), T); \
  using base_name = ::Petsc::device::cupm::impl::InterfaceImpl<T>; \
  /* types */ \
  using typename base_name::cupmComplex_t; \
  using typename base_name::cupmDeviceProp_t; \
  using typename base_name::cupmDim3; \
  using typename base_name::cupmError_t; \
  using typename base_name::cupmEvent_t; \
  using typename base_name::cupmMemPoolAttr; \
  using typename base_name::cupmMemPool_t; \
  using typename base_name::cupmMemcpyKind_t; \
  using typename base_name::cupmMemoryType_t; \
  using typename base_name::cupmPointerAttributes_t; \
  using typename base_name::cupmStream_t; \
  /* variables: error codes */ \
  using base_name::cupmSuccess; \
  using base_name::cupmErrorNotReady; \
  using base_name::cupmErrorDeviceAlreadyInUse; \
  using base_name::cupmErrorSetOnActiveProcess; \
  using base_name::cupmErrorStubLibrary; \
  using base_name::cupmErrorNoDevice; \
  /* variables: flags and kinds */ \
  using base_name::cupmStreamDefault; \
  using base_name::cupmStreamNonBlocking; \
  using base_name::cupmDeviceMapHost; \
  using base_name::cupmMemcpyHostToDevice; \
  using base_name::cupmMemcpyDeviceToHost; \
  using base_name::cupmMemcpyDeviceToDevice; \
  using base_name::cupmMemcpyHostToHost; \
  using base_name::cupmMemcpyDefault; \
  using base_name::cupmMemoryTypeHost; \
  using base_name::cupmMemoryTypeDevice; \
  using base_name::cupmMemoryTypeManaged; \
  using base_name::cupmEventDisableTiming; \
  using base_name::cupmHostAllocDefault; \
  using base_name::cupmHostAllocWriteCombined; \
  using base_name::cupmMemPoolAttrReleaseThreshold; \
  /* functions: errors */ \
  using base_name::cupmGetErrorName; \
  using base_name::cupmGetErrorString; \
  using base_name::cupmGetLastError; \
  /* functions: device management */ \
  using base_name::cupmGetDeviceCount; \
  using base_name::cupmGetDeviceProperties; \
  using base_name::cupmGetDevice; \
  using base_name::cupmSetDevice; \
  using base_name::cupmGetDeviceFlags; \
  using base_name::cupmSetDeviceFlags; \
  using base_name::cupmPointerGetAttributes; \
  using base_name::cupmDeviceGetMemPool; \
  using base_name::cupmMemPoolSetAttribute; \
  using base_name::cupmInit; \
  /* functions: stream management */ \
  using base_name::cupmEventCreate; \
  using base_name::cupmEventCreateWithFlags; \
  using base_name::cupmEventDestroy; \
  using base_name::cupmEventRecord; \
  using base_name::cupmEventSynchronize; \
  using base_name::cupmEventElapsedTime; \
  using base_name::cupmEventQuery; \
  using base_name::cupmStreamCreate; \
  using base_name::cupmStreamCreateWithFlags; \
  using base_name::cupmStreamGetFlags; \
  using base_name::cupmStreamDestroy; \
  using base_name::cupmStreamWaitEvent; \
  using base_name::cupmStreamQuery; \
  using base_name::cupmStreamSynchronize; \
  using base_name::cupmDeviceSynchronize; \
  using base_name::cupmGetSymbolAddress; \
  /* functions: memory management */ \
  using base_name::cupmMalloc; \
  using base_name::cupmMallocAsync; \
  using base_name::cupmMemcpy; \
  using base_name::cupmMemcpyAsync; \
  using base_name::cupmMallocHost; \
  using base_name::cupmMemset; \
  using base_name::cupmMemsetAsync; \
  /* functions: launch control */ \
  using base_name::cupmLaunchHostFunc
627: template <DeviceType>
628: struct Interface;
630: // The actual interface class
631: template <DeviceType T>
632: struct Interface : InterfaceImpl<T> {
633: PETSC_CUPM_IMPL_CLASS_HEADER(interface_type, T);
635: using cupmReal_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), float, double>;
636: using cupmScalar_t = util::conditional_t<PetscDefined(USE_COMPLEX), cupmComplex_t, cupmReal_t>;
638: // REVIEW ME: this needs to be cleaned up, it is unreadable
639: PETSC_CXX_COMPAT_DECL(constexpr auto makeCupmScalar(PetscScalar s))
640: PETSC_DECLTYPE_AUTO_RETURNS(PetscIfPetscDefined(USE_COMPLEX, (cupmComplex_t{PetscRealPart(s), PetscImaginaryPart(s)}), static_cast<cupmReal_t>(s)));
642: PETSC_CXX_COMPAT_DECL(constexpr auto cupmScalarCast(const PetscScalar *s))
643: PETSC_DECLTYPE_AUTO_RETURNS(reinterpret_cast<const cupmScalar_t *>(s));
645: PETSC_CXX_COMPAT_DECL(constexpr auto cupmScalarCast(PetscScalar *s))
646: PETSC_DECLTYPE_AUTO_RETURNS(reinterpret_cast<cupmScalar_t *>(s));
648: PETSC_CXX_COMPAT_DECL(constexpr auto cupmRealCast(PetscReal *s))
649: PETSC_DECLTYPE_AUTO_RETURNS(reinterpret_cast<cupmReal_t *>(s));
651: PETSC_CXX_COMPAT_DECL(constexpr auto cupmRealCast(const PetscReal *s))
652: PETSC_DECLTYPE_AUTO_RETURNS(reinterpret_cast<const cupmReal_t *>(s));
654: #if !defined(PETSC_PKG_CUDA_VERSION_GE)
655: #define PETSC_PKG_CUDA_VERSION_GE(...) 0
656: #define CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE
657: #endif
658: PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMGetMemType(const void *data, PetscMemType *type, PetscBool *registered = nullptr, PetscBool *managed = nullptr))
659: {
660: cupmPointerAttributes_t attr;
661: cupmError_t cerr;
664: if (registered) {
666: *registered = PETSC_FALSE;
667: }
668: if (managed) {
670: *managed = PETSC_FALSE;
671: }
672: // Do not check error, instead reset it via GetLastError() since before CUDA 11.0, passing
673: // a host pointer returns cudaErrorInvalidValue
674: cerr = cupmPointerGetAttributes(&attr, data);
675: cerr = cupmGetLastError();
676: // HIP seems to always have used memoryType though
677: #if (defined(CUDART_VERSION) && (CUDART_VERSION < 10000)) || defined(__HIP_PLATFORM_HCC__)
678: const auto mtype = attr.memoryType;
679: if (managed) *managed = static_cast<PetscBool>((cerr == cupmSuccess) && attr.isManaged);
680: #else
681: if (PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) && (T == DeviceType::CUDA)) cerr;
682: const auto mtype = attr.type;
683: if (managed) *managed = static_cast<PetscBool>(mtype == cupmMemoryTypeManaged);
684: #endif // CUDART_VERSION && CUDART_VERSION < 10000 || __HIP_PLATFORM_HCC__
685: if (type) *type = ((cerr == cupmSuccess) && (mtype == cupmMemoryTypeDevice)) ? PETSC_MEMTYPE_CUPM() : PETSC_MEMTYPE_HOST;
686: if (registered && (cerr == cupmSuccess) && (mtype == cupmMemoryTypeHost)) *registered = PETSC_TRUE;
687: return 0;
688: }
689: #if defined(CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE)
690: #undef PETSC_PKG_CUDA_VERSION_GE
691: #endif
693: PETSC_CXX_COMPAT_DECL(PETSC_CONSTEXPR_14 cupmMemcpyKind_t PetscDeviceCopyModeToCUPMMemcpyKind(PetscDeviceCopyMode mode))
694: {
695: switch (mode) {
696: case PETSC_DEVICE_COPY_HTOH:
697: return cupmMemcpyHostToHost;
698: case PETSC_DEVICE_COPY_HTOD:
699: return cupmMemcpyHostToDevice;
700: case PETSC_DEVICE_COPY_DTOD:
701: return cupmMemcpyDeviceToDevice;
702: case PETSC_DEVICE_COPY_DTOH:
703: return cupmMemcpyDeviceToHost;
704: case PETSC_DEVICE_COPY_AUTO:
705: return cupmMemcpyDefault;
706: }
707: PetscUnreachable();
708: return cupmMemcpyDefault;
709: }
711: // these change what the arguments mean, so need to namespace these
712: template <typename M>
713: PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMMallocAsync(M **ptr, std::size_t n, cupmStream_t stream = nullptr))
714: {
715: static_assert(!std::is_void<M>::value, "");
718: if (PetscLikely(n)) {
719: cupmMallocAsync(reinterpret_cast<void **>(ptr), n * sizeof(M), stream);
720: } else {
721: *ptr = nullptr;
722: }
723: return 0;
724: }
726: template <typename M>
727: PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMMalloc(M **ptr, std::size_t n))
728: {
729: PetscCUPMMallocAsync(ptr, n);
730: return 0;
731: }
733: template <typename M>
734: PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMMallocHost(M **ptr, std::size_t n, unsigned int flags = cupmHostAllocDefault))
735: {
736: static_assert(!std::is_void<M>::value, "");
739: *ptr = nullptr;
740: cupmMallocHost(reinterpret_cast<void **>(ptr), n * sizeof(M), flags);
741: return 0;
742: }
744: template <typename D, typename S = D>
745: PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMMemcpyAsync(D *dest, const S *src, std::size_t n, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr, bool use_async = false))
746: {
747: static_assert(sizeof(D) == sizeof(S), "");
748: static_assert(!std::is_void<D>::value && !std::is_void<S>::value, "");
749: const auto size = n * sizeof(D);
751: if (PetscUnlikely(!n)) return 0;
755: // do early return after nullptr check since we need to check that they arent both nullptrs
756: if (PetscUnlikely(dest == src)) return 0;
757: if (kind == cupmMemcpyHostToHost) {
758: if (cupmStreamQuery(stream) == cupmSuccess) {
759: PetscMemcpy(dest, src, size);
760: return 0;
761: }
762: cupmGetLastError();
763: }
764: if (use_async || stream || (kind != cupmMemcpyDeviceToHost)) {
765: cupmMemcpyAsync(dest, src, size, kind, stream);
766: } else {
767: cupmMemcpy(dest, src, size, kind);
768: }
770: // only the explicit HTOD or DTOH are handled, since we either don't log the other cases
771: // (yet) or don't know the direction
772: if (kind == cupmMemcpyDeviceToHost) {
773: PetscLogGpuToCpu(size);
774: } else if (kind == cupmMemcpyHostToDevice) {
775: PetscLogCpuToGpu(size);
776: }
777: return 0;
778: }
780: template <typename D, typename S = D>
781: PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMMemcpy(D *dest, const S *src, std::size_t n, cupmMemcpyKind_t kind))
782: {
783: PetscCUPMMemcpyAsync(dest, src, n, kind);
784: return 0;
785: }
787: template <typename M>
788: PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMMemsetAsync(M *ptr, int value, std::size_t n, cupmStream_t stream = nullptr, bool use_async = false))
789: {
790: static_assert(!std::is_void<M>::value, "");
792: if (PetscLikely(n)) {
793: const auto bytes = n * sizeof(M);
796: if (stream || use_async) {
797: cupmMemsetAsync(ptr, value, bytes, stream);
798: } else {
799: cupmMemset(ptr, value, bytes);
800: }
801: }
802: return 0;
803: }
805: template <typename M>
806: PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMMemset(M *ptr, int value, std::size_t n))
807: {
808: PetscCUPMMemsetAsync(ptr, value, n);
809: return 0;
810: }
812: // these we can transparently wrap, no need to namespace it to Petsc
813: template <typename M>
814: PETSC_CXX_COMPAT_DECL(cupmError_t cupmFreeAsync(M &&ptr, cupmStream_t stream = nullptr))
815: {
816: static_assert(std::is_pointer<util::decay_t<M>>::value, "");
818: if (ptr) {
819: auto cerr = interface_type::cupmFreeAsync(std::forward<M>(ptr), stream);
821: ptr = nullptr;
822: if (PetscUnlikely(cerr != cupmSuccess)) return cerr;
823: }
824: return cupmSuccess;
825: }
827: PETSC_CXX_COMPAT_DECL(cupmError_t cupmFreeAsync(std::nullptr_t ptr, cupmStream_t stream = nullptr)) { return interface_type::cupmFreeAsync(ptr, stream); }
829: template <typename M>
830: PETSC_CXX_COMPAT_DECL(cupmError_t cupmFree(M &&ptr))
831: {
832: return cupmFreeAsync(std::forward<M>(ptr));
833: }
835: PETSC_CXX_COMPAT_DECL(cupmError_t cupmFree(std::nullptr_t ptr)) { return cupmFreeAsync(ptr); }
837: template <typename M>
838: PETSC_CXX_COMPAT_DECL(cupmError_t cupmFreeHost(M &&ptr))
839: {
840: static_assert(std::is_pointer<util::decay_t<M>>::value, "");
841: const auto cerr = interface_type::cupmFreeHost(std::forward<M>(ptr));
842: ptr = nullptr;
843: return cerr;
844: }
846: PETSC_CXX_COMPAT_DECL(cupmError_t cupmFreeHost(std::nullptr_t ptr)) { return interface_type::cupmFreeHost(ptr); }
848: // specific wrapper for device launch function, as the real function is a C routine and
849: // doesn't have variable arguments. The actual mechanics of this are a bit complicated but
850: // boils down to the fact that ultimately we pass a
851: //
852: // void *args[] = {(void*)&kernel_args...};
853: //
854: // to the kernel launcher. Since we pass void* this means implicit conversion does **not**
855: // happen to the kernel arguments so we must do it ourselves here. This function does this in
856: // 3 stages:
857: // 1. Enumerate the kernel arguments (cupmLaunchKernel)
858: // 2. Deduce the signature of func() and static_cast the kernel arguments to the type
859: // expected by func() using the enumeration above (deduceKernelCall)
860: // 3. Form the void* array with the converted arguments and call cuda/hipLaunchKernel with
861: // it. (interface_type::cupmLaunchKernel)
862: template <typename F, typename... Args>
863: PETSC_CXX_COMPAT_DECL(cupmError_t cupmLaunchKernel(F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs))
864: {
865: return deduceKernelCall(util::index_sequence_for<Args...>{}, std::forward<F>(func), std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream), std::forward<Args>(kernelArgs)...);
866: }
868: template <std::size_t block_size = 256, std::size_t warp_size = 32, typename F, typename... Args>
869: PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMLaunchKernel1D(std::size_t n, std::size_t sharedMem, cupmStream_t stream, F &&func, Args &&...kernelArgs))
870: {
871: static_assert(block_size > 0, "");
872: static_assert(warp_size > 0, "");
873: // want block_size to be a multiple of the warp_size
874: static_assert(block_size % warp_size == 0, "");
875: const auto nthread = std::min(n, block_size);
876: const auto nblock = (n + block_size - 1) / block_size;
878: // if n = 0 then nthread = 0, which is not allowed. rather than letting the user try to
879: // decipher cryptic 'cuda/hipErrorLaunchFailure' we explicitly check for zero here
880: PetscAssert(nthread, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Trying to launch kernel with grid/block size 0");
881: cupmLaunchKernel(std::forward<F>(func), nblock, nthread, sharedMem, stream, std::forward<Args>(kernelArgs)...);
882: return 0;
883: }
885: private:
886: template <typename S, typename D, typename = void>
887: struct is_static_castable : std::false_type { };
889: template <typename S, typename D>
890: struct is_static_castable<S, D, util::void_t<decltype(static_cast<D>(std::declval<S>()))>> : std::true_type { };
892: template <typename D, typename S>
893: static constexpr util::enable_if_t<is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
894: {
895: return static_cast<D>(std::forward<S>(src));
896: }
898: template <typename D, typename S>
899: static constexpr util::enable_if_t<!is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
900: {
901: return const_cast<D>(std::forward<S>(src));
902: }
904: template <typename F, typename... Args, std::size_t... Idx>
905: PETSC_CXX_COMPAT_DECL(cupmError_t deduceKernelCall(util::index_sequence<Idx...>, F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs))
906: {
907: // clang-format off
908: return interface_type::template cupmLaunchKernel(
909: std::forward<F>(func),
910: std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream),
911: // can't static_cast() here since the function argument type may be cv-qualified, in
912: // which case we would need to const_cast(). But you can only const_cast()
913: // indirect types (pointers, references) and I don't want to add a
914: // static_cast_that_becomes_a_const_cast() SFINAE monster to this template mess. C-style
915: // casts luckily work here since it tries the following and uses the first one that
916: // succeeds:
917: // 1. const_cast()
918: // 2. static_cast()
919: // 3. static_cast() then const_cast()
920: // 4. reinterpret_cast()...
921: // hopefully we never get to reinterpret_cast() land
922: //(typename util::func_traits<F>::template arg<Idx>::type)(kernelArgs)...
923: cast_to<typename util::func_traits<F>::template arg<Idx>::type>(std::forward<Args>(kernelArgs))...
924: );
925: // clang-format on
926: }
927: };
// PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING() - pull the names provided by Interface<T>
// into the current scope
//
// input params:
// base_name - the name of the alias to declare for ::Petsc::device::cupm::impl::Interface<T>,
//             it is also passed (with _impl appended) to PETSC_CUPM_IMPL_CLASS_HEADER()
// T         - the template argument for Interface
//
// notes:
// The final using-declaration is deliberately left without a terminating semicolon, it is
// supplied at the point where the macro is used.
#define PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(base_name, T) \
  PETSC_CUPM_IMPL_CLASS_HEADER(PetscConcat(base_name, _impl), T); \
  using base_name = ::Petsc::device::cupm::impl::Interface<T>; \
  /* scalar types and conversions */ \
  using typename base_name::cupmScalar_t; \
  using typename base_name::cupmReal_t; \
  using base_name::cupmScalarCast; \
  using base_name::cupmRealCast; \
  using base_name::makeCupmScalar; \
  /* allocation */ \
  using base_name::PetscCUPMMallocAsync; \
  using base_name::PetscCUPMMalloc; \
  using base_name::PetscCUPMMallocHost; \
  /* copying and setting */ \
  using base_name::PetscCUPMMemcpyAsync; \
  using base_name::PetscCUPMMemcpy; \
  using base_name::PetscCUPMMemsetAsync; \
  using base_name::PetscCUPMMemset; \
  /* deallocation */ \
  using base_name::cupmFreeAsync; \
  using base_name::cupmFree; \
  using base_name::cupmFreeHost; \
  /* kernel launch */ \
  using base_name::cupmLaunchKernel; \
  using base_name::PetscCUPMLaunchKernel1D; \
  /* queries and conversions */ \
  using base_name::PetscCUPMGetMemType; \
  using base_name::PetscDeviceCopyModeToCUPMMemcpyKind
952: } // namespace impl
954: } // namespace cupm
956: } // namespace device
958: } // namespace Petsc
960: #endif /* __cplusplus */
962: #endif /* PETSCCUPMINTERFACE_HPP */