Commit ceb57c2a authored by YdrMaster's avatar YdrMaster
Browse files

issue/291/style: 根据实际情况将 cuda 改为 nvidia


Signed-off-by: YdrMaster <ydrml@hotmail.com>
parent d76a2607
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include "cpu/cpu_handle.h" #include "cpu/cpu_handle.h"
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
#include "cuda/cuda_handle.h" #include "nvidia/nvidia_handle.h"
#endif #endif
#ifdef ENABLE_CAMBRICON_API #ifdef ENABLE_CAMBRICON_API
#include "bang/bang_handle.h" #include "bang/bang_handle.h"
...@@ -42,7 +42,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) { ...@@ -42,7 +42,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
CREATE(INFINI_DEVICE_CPU, cpu); CREATE(INFINI_DEVICE_CPU, cpu);
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda::nvidia); CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_CAMBRICON_API #ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, bang::cambricon); CREATE(INFINI_DEVICE_CAMBRICON, bang::cambricon);
...@@ -79,7 +79,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { ...@@ -79,7 +79,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
DELETE(INFINI_DEVICE_CPU, cpu); DELETE(INFINI_DEVICE_CPU, cpu);
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda::nvidia); DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_CAMBRICON_API #ifdef ENABLE_CAMBRICON_API
DELETE(INFINI_DEVICE_CAMBRICON, bang::cambricon); DELETE(INFINI_DEVICE_CAMBRICON, bang::cambricon);
......
#include "cuda_handle.cuh" #include "nvidia_handle.cuh"
namespace device::cuda { namespace device::nvidia {
Handle::Handle(infiniDevice_t device, int device_id) Handle::Handle(int device_id)
: InfiniopHandle{device, device_id}, : InfiniopHandle{INFINI_DEVICE_NVIDIA, device_id},
_internal(std::make_shared<Handle::Internal>(device_id)) {} _internal(std::make_shared<Handle::Internal>(device_id)) {}
auto Handle::internal() const -> const std::shared_ptr<Internal> & { auto Handle::internal() const -> const std::shared_ptr<Internal> & {
...@@ -83,16 +83,9 @@ cudnnDataType_t getCudnnDtype(infiniDtype_t dt) { ...@@ -83,16 +83,9 @@ cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
} }
#endif #endif
namespace nvidia {
Handle::Handle(int device_id)
: cuda::Handle(INFINI_DEVICE_NVIDIA, device_id) {}
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) { infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
*handle_ptr = new Handle(device_id); *handle_ptr = new Handle(device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace nvidia } // namespace device::nvidia
} // namespace device::cuda
#ifndef __INFINIOP_CUDA_COMMON_CUH__ #ifndef __INFINIOP_CUDA_COMMON_CUH__
#define __INFINIOP_CUDA_COMMON_CUH__ #define __INFINIOP_CUDA_COMMON_CUH__
#include "cuda_handle.cuh"
#include "infinicore.h" #include "infinicore.h"
#include "nvidia_handle.cuh"
namespace device::cuda { namespace device::nvidia {
#ifdef ENABLE_CUDNN_API #ifdef ENABLE_CUDNN_API
cudnnDataType_t getCudnnDtype(infiniDtype_t dt); cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
#endif #endif
} // namespace device::cuda } // namespace device::nvidia
#endif // __INFINIOP_CUDA_COMMON_CUH__ #endif // __INFINIOP_CUDA_COMMON_CUH__
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "../../../utils.h" #include "../../../utils.h"
#include "../pool.h" #include "../pool.h"
#include "cuda_handle.h" #include "nvidia_handle.h"
#include <cublas_v2.h> #include <cublas_v2.h>
#include <functional> #include <functional>
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS) #define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS)
#define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS) #define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)
namespace device::cuda { namespace device::nvidia {
class Handle::Internal { class Handle::Internal {
Pool<cublasHandle_t> blas_handles; Pool<cublasHandle_t> blas_handles;
...@@ -48,6 +48,6 @@ public: ...@@ -48,6 +48,6 @@ public:
int gridSizeZ() const; int gridSizeZ() const;
}; };
} // namespace device::cuda } // namespace device::nvidia
#endif // __INFINIOP_CUDA_HANDLE_CUH__ #endif // __INFINIOP_CUDA_HANDLE_CUH__
...@@ -4,30 +4,20 @@ ...@@ -4,30 +4,20 @@
#include "../../handle.h" #include "../../handle.h"
#include <memory> #include <memory>
namespace device::cuda { namespace device::nvidia {
struct Handle : public InfiniopHandle { struct Handle : public InfiniopHandle {
Handle(int device_id);
class Internal; class Internal;
auto internal() const -> const std::shared_ptr<Internal> &; auto internal() const -> const std::shared_ptr<Internal> &;
protected: public:
Handle(infiniDevice_t device, int device_id); static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
private: private:
std::shared_ptr<Internal> _internal; std::shared_ptr<Internal> _internal;
}; };
namespace nvidia { } // namespace device::nvidia
class Handle : public cuda::Handle {
Handle(int device_id);
public:
static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
};
} // namespace nvidia
} // namespace device::cuda
#endif // __INFINIOP_CUDA_HANDLE_H__ #endif // __INFINIOP_CUDA_HANDLE_H__
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
using cuda_bfloat16 = nv_bfloat16; using cuda_bfloat16 = nv_bfloat16;
using cuda_bfloat162 = nv_bfloat162; using cuda_bfloat162 = nv_bfloat162;
namespace device::cuda { namespace device::nvidia {
// return the memory offset of original tensor, given the flattened index of broadcasted tensor // return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t __forceinline__ __device__ __host__ size_t
indexToReducedOffset( indexToReducedOffset(
...@@ -48,7 +48,7 @@ indexToOffset( ...@@ -48,7 +48,7 @@ indexToOffset(
} }
return res; return res;
} }
} // namespace device::cuda } // namespace device::nvidia
__forceinline__ __device__ float __forceinline__ __device__ float
exp_(const float val) { exp_(const float val) {
......
...@@ -12,45 +12,45 @@ ...@@ -12,45 +12,45 @@
#include <numeric> #include <numeric>
#include <vector> #include <vector>
#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, KERNEL_COMMON) \ #define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE) \
\ \
namespace op::OP::NAMESPACE { \ namespace op::OP::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \ class Descriptor final : public InfiniopDescriptor { \
infiniDtype_t _dtype; \ infiniDtype_t _dtype; \
op::elementwise::ElementwiseInfo _info; \ op::elementwise::ElementwiseInfo _info; \
std::unique_ptr<op::elementwise::KERNEL_COMMON::DeviceImpl> _device_info; \ std::unique_ptr<op::elementwise::NAMESPACE::DeviceImpl> _device_info; \
size_t _workspace_size; \ size_t _workspace_size; \
\ \
Descriptor( \ Descriptor( \
infiniDtype_t dtype, \ infiniDtype_t dtype, \
op::elementwise::ElementwiseInfo info, \ op::elementwise::ElementwiseInfo info, \
op::elementwise::KERNEL_COMMON::DeviceImpl *device_info, \ op::elementwise::NAMESPACE::DeviceImpl *device_info, \
size_t workspace_size, \ size_t workspace_size, \
infiniDevice_t device_type, \ infiniDevice_t device_type, \
int device_id) \ int device_id) \
: InfiniopDescriptor{device_type, device_id}, \ : InfiniopDescriptor{device_type, device_id}, \
_dtype(dtype), \ _dtype(dtype), \
_info(std::move(info)), \ _info(std::move(info)), \
_device_info(std::move(device_info)), \ _device_info(std::move(device_info)), \
_workspace_size(workspace_size) {} \ _workspace_size(workspace_size) {} \
\ \
public: \ public: \
~Descriptor(); \ ~Descriptor(); \
\ \
size_t workspaceSize() const { return _workspace_size; } \ size_t workspaceSize() const { return _workspace_size; } \
\ \
static infiniStatus_t create( \ static infiniStatus_t create( \
infiniopHandle_t handle, \ infiniopHandle_t handle, \
Descriptor **desc_ptr, \ Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \ infiniopTensorDescriptor_t output_desc, \
std::vector<infiniopTensorDescriptor_t> input_descs); \ std::vector<infiniopTensorDescriptor_t> input_descs); \
\ \
infiniStatus_t calculate( \ infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \ void *workspace, size_t workspace_size, \
void *output, \ void *output, \
std::vector<const void *> inputs, \ std::vector<const void *> inputs, \
void *stream) const; \ void *stream) const; \
}; \ }; \
} }
namespace op::elementwise { namespace op::elementwise {
......
...@@ -2,11 +2,11 @@ ...@@ -2,11 +2,11 @@
#define __INFINIOP_ELEMENTWISE_CUDA_H__ #define __INFINIOP_ELEMENTWISE_CUDA_H__
#include "../../../utils.h" #include "../../../utils.h"
#include "../../devices/cuda/cuda_common.cuh" #include "../../devices/nvidia/nvidia_common.cuh"
#include "../../devices/cuda/cuda_kernel_common.cuh" #include "../../devices/nvidia/nvidia_kernel_common.cuh"
#include "elementwise_cuda_api.cuh" #include "elementwise_nvidia_api.cuh"
namespace op::elementwise::cuda { namespace op::elementwise::nvidia {
/** /**
* @brief Casts an untyped device pointer to a typed pointer of type T. * @brief Casts an untyped device pointer to a typed pointer of type T.
...@@ -33,7 +33,7 @@ __device__ __forceinline__ const T *typedInputPtr(const void *ptr) { ...@@ -33,7 +33,7 @@ __device__ __forceinline__ const T *typedInputPtr(const void *ptr) {
*/ */
__device__ __forceinline__ size_t getOutputIndex(size_t idx, bool is_contiguous, size_t ndim, __device__ __forceinline__ size_t getOutputIndex(size_t idx, bool is_contiguous, size_t ndim,
const size_t *shape, const ptrdiff_t *strides) { const size_t *shape, const ptrdiff_t *strides) {
return is_contiguous ? idx : device::cuda::indexToOffset(idx, ndim, shape, strides); return is_contiguous ? idx : device::nvidia::indexToOffset(idx, ndim, shape, strides);
} }
/** /**
...@@ -61,8 +61,8 @@ struct InputIndexer { ...@@ -61,8 +61,8 @@ struct InputIndexer {
return input_contiguous[input_id] return input_contiguous[input_id]
? idx ? idx
: (input_broadcasted[input_id] : (input_broadcasted[input_id]
? device::cuda::indexToReducedOffset(idx, ndim, output_strides, input_strides + input_id * ndim) ? device::nvidia::indexToReducedOffset(idx, ndim, output_strides, input_strides + input_id * ndim)
: device::cuda::indexToOffset(idx, ndim, input_shapes + input_id * ndim, input_strides + input_id * ndim)); : device::nvidia::indexToOffset(idx, ndim, input_shapes + input_id * ndim, input_strides + input_id * ndim));
} }
}; };
...@@ -186,9 +186,9 @@ INFINIOP_CUDA_KERNEL elementwiseKernel( ...@@ -186,9 +186,9 @@ INFINIOP_CUDA_KERNEL elementwiseKernel(
} }
struct DeviceImpl::Opaque { struct DeviceImpl::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal; std::shared_ptr<device::nvidia::Handle::Internal> internal;
Opaque(const std::shared_ptr<device::cuda::Handle::Internal> &internal) Opaque(const std::shared_ptr<device::nvidia::Handle::Internal> &internal)
: internal(internal) {} : internal(internal) {}
/** /**
...@@ -414,6 +414,6 @@ infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &inf ...@@ -414,6 +414,6 @@ infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &inf
std::forward<Args>(args)...); std::forward<Args>(args)...);
} }
} // namespace op::elementwise::cuda } // namespace op::elementwise::nvidia
#endif // __INFINIOP_ELEMENTWISE_CUDA_H__ #endif // __INFINIOP_ELEMENTWISE_CUDA_H__
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "../elementwise.h" #include "../elementwise.h"
namespace op::elementwise::cuda { namespace op::elementwise::nvidia {
/** /**
* @brief Define the methods and info needed by CUDA to perform elementwise operation * @brief Define the methods and info needed by CUDA to perform elementwise operation
...@@ -77,7 +77,7 @@ public: ...@@ -77,7 +77,7 @@ public:
void *stream, void *stream,
Args &&...args); Args &&...args);
}; };
} // namespace op::elementwise::cuda } // namespace op::elementwise::nvidia
/** /**
* @brief Define the process for initializing a Descriptor of an elementwise operation * @brief Define the process for initializing a Descriptor of an elementwise operation
...@@ -88,22 +88,22 @@ public: ...@@ -88,22 +88,22 @@ public:
* @param OUT_DESC The output tensor descriptor. * @param OUT_DESC The output tensor descriptor.
* @param INPUT_DESC_VEC A vector containing input tensor descriptors. * @param INPUT_DESC_VEC A vector containing input tensor descriptors.
*/ */
#define CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC) \ #define CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC) \
\ \
auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC); \ auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC); \
CHECK_RESULT(info_result); \ CHECK_RESULT(info_result); \
auto info = info_result.take(); \ auto info = info_result.take(); \
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); \ auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); \
\ \
auto device_impl_result = op::elementwise::cuda::DeviceImpl::create(HANDLE->internal()); \ auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(HANDLE->internal()); \
CHECK_RESULT(device_impl_result); \ CHECK_RESULT(device_impl_result); \
\ \
*desc_ptr = new Descriptor( \ *desc_ptr = new Descriptor( \
DTYPE, \ DTYPE, \
std::move(info), \ std::move(info), \
std::move(device_impl_result.take()), \ std::move(device_impl_result.take()), \
workspace_size, \ workspace_size, \
HANDLE->device, \ HANDLE->device, \
HANDLE->device_id); HANDLE->device_id);
#endif // __INFINIOP_ELEMENTWISE_CUDA_API_H__ #endif // __INFINIOP_ELEMENTWISE_CUDA_API_H__
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "../../../elementwise/cpu/elementwise_cpu.h" #include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(add, cpu, cpu) ELEMENTWISE_DESCRIPTOR(add, cpu)
namespace op::add::cpu { namespace op::add::cpu {
typedef struct AddOp { typedef struct AddOp {
......
...@@ -3,6 +3,6 @@ ...@@ -3,6 +3,6 @@
#include "../../../elementwise/metax/elementwise_metax_api.h" #include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(add, metax, metax) ELEMENTWISE_DESCRIPTOR(add, metax)
#endif // __ADD_METAX_API_H__ #endif // __ADD_METAX_API_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh" #include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh" #include "../cuda/kernel.cuh"
#include "add_nvidia.cuh" #include "add_nvidia.cuh"
...@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create( ...@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) { std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_); auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = out_desc->dtype(); auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0); const auto &a_desc = input_desc_vec.at(0);
......
#ifndef __ADD_CUDA_API_H__ #ifndef __ADD_CUDA_API_H__
#define __ADD_CUDA_API_H__ #define __ADD_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh" #include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(add, nvidia, cuda) ELEMENTWISE_DESCRIPTOR(add, nvidia)
#endif // __ADD_CUDA_API_H__ #endif // __ADD_CUDA_API_H__
#include "../../../devices/cuda/cuda_common.cuh" #include "../../../devices/nvidia/nvidia_common.cuh"
#include "causal_softmax_nvidia.cuh" #include "causal_softmax_nvidia.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh" #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include <cub/block/block_reduce.cuh> #include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh" #include "../../../reduce/cuda/reduce.cuh"
...@@ -20,7 +20,7 @@ INFINIOP_CUDA_KERNEL causalSoftmax( ...@@ -20,7 +20,7 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
namespace op::causal_softmax::nvidia { namespace op::causal_softmax::nvidia {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal; std::shared_ptr<device::nvidia::Handle::Internal> internal;
}; };
Descriptor::~Descriptor() { Descriptor::~Descriptor() {
...@@ -35,7 +35,7 @@ infiniStatus_t Descriptor::create( ...@@ -35,7 +35,7 @@ infiniStatus_t Descriptor::create(
auto info = CausalSoftmaxInfo::create(y_desc, x_desc); auto info = CausalSoftmaxInfo::create(y_desc, x_desc);
CHECK_RESULT(info); CHECK_RESULT(info);
*desc_ptr = new Descriptor( *desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::cuda::Handle *>(handle)->internal()}, new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
info.take(), 0, handle->device, handle->device_id); info.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
#include "../../../elementwise/cpu/elementwise_cpu.h" #include "../../../elementwise/cpu/elementwise_cpu.h"
#include "infiniop/ops/clip.h" #include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR(clip, cpu, cpu) ELEMENTWISE_DESCRIPTOR(clip, cpu)
namespace op::clip::cpu { namespace op::clip::cpu {
......
...@@ -3,6 +3,6 @@ ...@@ -3,6 +3,6 @@
#include "../../../elementwise/metax/elementwise_metax_api.h" #include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(clip, metax, metax) ELEMENTWISE_DESCRIPTOR(clip, metax)
#endif // __CLIP_METAX_API_H__ #endif // __CLIP_METAX_API_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh" #include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh" #include "../cuda/kernel.cuh"
#include "clip_nvidia.cuh" #include "clip_nvidia.cuh"
...@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create( ...@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) { std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_); auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = out_desc->dtype(); auto dtype = out_desc->dtype();
const auto &in_desc = input_desc_vec.at(0); const auto &in_desc = input_desc_vec.at(0);
......
#ifndef __CLIP_CUDA_API_H__ #ifndef __CLIP_CUDA_API_H__
#define __CLIP_CUDA_API_H__ #define __CLIP_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh" #include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(clip, nvidia, cuda) ELEMENTWISE_DESCRIPTOR(clip, nvidia)
#endif // __CLIP_CUDA_API_H__ #endif // __CLIP_CUDA_API_H__
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include "../../tensor.h" #include "../../tensor.h"
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
#include "../../devices/cuda/cuda_handle.cuh" #include "../../devices/nvidia/nvidia_handle.cuh"
#endif #endif
namespace op::conv { namespace op::conv {
......
#include "../../../devices/cuda/cuda_common.cuh" #include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/cuda/cuda_handle.cuh" #include "../../../devices/nvidia/nvidia_handle.cuh"
#include "conv_cuda.cuh" #include "conv_nvidia.cuh"
#ifdef ENABLE_CUDNN_API
#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ #define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \
do { \ do { \
...@@ -22,11 +20,13 @@ ...@@ -22,11 +20,13 @@
DESTROY_CUDNN_DESCRIPTOR(conv_desc, cudnnDestroyConvolutionDescriptor); \ DESTROY_CUDNN_DESCRIPTOR(conv_desc, cudnnDestroyConvolutionDescriptor); \
} while (0) } while (0)
namespace op::conv::cuda { namespace op::conv::nvidia {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal; std::shared_ptr<device::nvidia::Handle::Internal> internal;
size_t workspace_size = 0;
#ifdef ENABLE_CUDNN_API
cudnnTensorDescriptor_t x_desc = nullptr; cudnnTensorDescriptor_t x_desc = nullptr;
cudnnTensorDescriptor_t y_desc = nullptr; cudnnTensorDescriptor_t y_desc = nullptr;
cudnnFilterDescriptor_t w_desc = nullptr; cudnnFilterDescriptor_t w_desc = nullptr;
...@@ -34,12 +34,13 @@ struct Descriptor::Opaque { ...@@ -34,12 +34,13 @@ struct Descriptor::Opaque {
cudnnActivationDescriptor_t act_desc = nullptr; cudnnActivationDescriptor_t act_desc = nullptr;
cudnnConvolutionDescriptor_t conv_desc = nullptr; cudnnConvolutionDescriptor_t conv_desc = nullptr;
cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
size_t workspace_size = 0; #endif
private: private:
Opaque(std::shared_ptr<device::cuda::Handle::Internal> internal_ptr) Opaque(std::shared_ptr<device::nvidia::Handle::Internal> internal_ptr)
: internal(internal_ptr) {} : internal(internal_ptr) {}
#ifdef ENABLE_CUDNN_API
void initializeDimensionArrays(const ConvInfo &info, void initializeDimensionArrays(const ConvInfo &info,
std::vector<int> &input_dims, std::vector<int> &input_dims,
std::vector<int> &output_dims, std::vector<int> &output_dims,
...@@ -108,11 +109,11 @@ private: ...@@ -108,11 +109,11 @@ private:
infiniStatus_t getCudnnDataType(infiniDtype_t data_type, infiniStatus_t getCudnnDataType(infiniDtype_t data_type,
cudnnDataType_t &cudnn_data_type) const { cudnnDataType_t &cudnn_data_type) const {
if (data_type == INFINI_DTYPE_F16) { if (data_type == INFINI_DTYPE_F16) {
cudnn_data_type = device::cuda::getCudnnDtype(data_type); cudnn_data_type = device::nvidia::getCudnnDtype(data_type);
} else if (data_type == INFINI_DTYPE_F32) { } else if (data_type == INFINI_DTYPE_F32) {
cudnn_data_type = device::cuda::getCudnnDtype(data_type); cudnn_data_type = device::nvidia::getCudnnDtype(data_type);
} else if (data_type == INFINI_DTYPE_BF16) { } else if (data_type == INFINI_DTYPE_BF16) {
cudnn_data_type = device::cuda::getCudnnDtype(data_type); cudnn_data_type = device::nvidia::getCudnnDtype(data_type);
} else { } else {
return INFINI_STATUS_BAD_TENSOR_DTYPE; return INFINI_STATUS_BAD_TENSOR_DTYPE;
} }
...@@ -253,32 +254,42 @@ private: ...@@ -253,32 +254,42 @@ private:
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
#endif
public: public:
Opaque(Opaque &&other) noexcept Opaque(Opaque &&other) noexcept
: internal(std::move(other.internal)), : internal(std::move(other.internal)),
x_desc(other.x_desc), workspace_size(other.workspace_size)
y_desc(other.y_desc), // clang-format off
w_desc(other.w_desc), #ifdef ENABLE_CUDNN_API
b_desc(other.b_desc), , x_desc(other.x_desc),
act_desc(other.act_desc), , y_desc(other.y_desc),
conv_desc(other.conv_desc), , w_desc(other.w_desc),
algo(other.algo), , b_desc(other.b_desc),
workspace_size(other.workspace_size) { , act_desc(other.act_desc),
, conv_desc(other.conv_desc),
, algo(other.algo)
#endif
// clang-format on
{
#ifdef ENABLE_CUDNN_API
other.x_desc = nullptr; other.x_desc = nullptr;
other.y_desc = nullptr; other.y_desc = nullptr;
other.w_desc = nullptr; other.w_desc = nullptr;
other.b_desc = nullptr; other.b_desc = nullptr;
other.act_desc = nullptr; other.act_desc = nullptr;
other.conv_desc = nullptr; other.conv_desc = nullptr;
#endif
other.workspace_size = 0; other.workspace_size = 0;
} }
~Opaque() { ~Opaque() {
#ifdef ENABLE_CUDNN_API
CLEANUP_CUDNN_DESCRIPTORS(); CLEANUP_CUDNN_DESCRIPTORS();
#endif
} }
#ifdef ENABLE_CUDNN_API
infiniStatus_t initializeCudnnContext(ConvInfo &info, infiniStatus_t initializeCudnnContext(ConvInfo &info,
infiniDtype_t data_type, infiniDtype_t data_type,
cudnnDataType_t compute_type) { cudnnDataType_t compute_type) {
...@@ -320,17 +331,22 @@ public: ...@@ -320,17 +331,22 @@ public:
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
#endif
static inline utils::Result<Opaque> create( static inline utils::Result<Opaque> create(
std::shared_ptr<device::cuda::Handle::Internal> internal_ptr, std::shared_ptr<device::nvidia::Handle::Internal> internal_ptr,
ConvInfo &info, ConvInfo &info,
infiniDtype_t data_type) { infiniDtype_t data_type) {
#ifdef ENABLE_CUDNN_API
Opaque opaque(internal_ptr); Opaque opaque(internal_ptr);
auto status = opaque.initializeCudnnContext(info, data_type, CUDNN_DATA_FLOAT); auto status = opaque.initializeCudnnContext(info, data_type, CUDNN_DATA_FLOAT);
if (status != INFINI_STATUS_SUCCESS) { if (status != INFINI_STATUS_SUCCESS) {
return status; return status;
} }
return utils::Result<Opaque>(std::move(opaque)); return utils::Result<Opaque>(std::move(opaque));
#else
return INFINI_STATUS_SUCCESS;
#endif
} }
}; };
...@@ -351,7 +367,8 @@ infiniStatus_t Descriptor::create( ...@@ -351,7 +367,8 @@ infiniStatus_t Descriptor::create(
const void *strides, const void *strides,
const void *dilations, const void *dilations,
size_t n) { size_t n) {
auto handle = reinterpret_cast<device::cuda::nvidia::Handle *>(handle_); #ifdef ENABLE_CUDNN_API
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = y_desc->dtype(); auto dtype = y_desc->dtype();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
...@@ -373,6 +390,9 @@ infiniStatus_t Descriptor::create( ...@@ -373,6 +390,9 @@ infiniStatus_t Descriptor::create(
handle->device, handle->device,
handle->device_id); handle->device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
#else
return INFINI_STATUS_SUCCESS;
#endif
} }
infiniStatus_t Descriptor::calculate( infiniStatus_t Descriptor::calculate(
...@@ -383,6 +403,7 @@ infiniStatus_t Descriptor::calculate( ...@@ -383,6 +403,7 @@ infiniStatus_t Descriptor::calculate(
const void *w, const void *w,
const void *bias, const void *bias,
void *stream) const { void *stream) const {
#ifdef ENABLE_CUDNN_API
const float alpha = 1.0f, beta = 0.0f; const float alpha = 1.0f, beta = 0.0f;
if (bias != nullptr) { if (bias != nullptr) {
CHECK_STATUS(_opaque->internal->useCudnn( CHECK_STATUS(_opaque->internal->useCudnn(
...@@ -428,7 +449,8 @@ infiniStatus_t Descriptor::calculate( ...@@ -428,7 +449,8 @@ infiniStatus_t Descriptor::calculate(
} }
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
#else
return INFINI_STATUS_SUCCESS;
#endif
} }
} // namespace op::conv::cuda } // namespace op::conv::nvidia
#endif // ENABLE_CUDNN_API
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment