Merge pull request #325 from YdrMaster/main

issue/291/style: 根据实际情况将 cuda 改为 nvidia

Merge pull request #325 from YdrMaster/main
issue/291/style: 根据实际情况将 cuda 改为 nvidia
a9dbb97a · PanZezhong1725 · GitHub · d76a2607 · ceb57c2a · a9dbb97a
Unverified Commit a9dbb97a authored Jul 14, 2025 by PanZezhong1725 Committed by GitHub Jul 14, 2025
20 changed files
--- a/src/infiniop/devices/handle.cc
+++ b/src/infiniop/devices/handle.cc
@@ -6,7 +6,7 @@
 #include "cpu/cpu_handle.h"
 #endif
 #ifdef ENABLE_NVIDIA_API
-#include "cuda/cuda_handle.h"
+#include "nvidia/nvidia_handle.h"
 #endif
 #ifdef ENABLE_CAMBRICON_API
 #include "bang/bang_handle.h"
@@ -42,7 +42,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
        CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_CAMBRICON_API
        CREATE(INFINI_DEVICE_CAMBRICON, bang::cambricon);
@@ -79,7 +79,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
        DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_CAMBRICON_API
        DELETE(INFINI_DEVICE_CAMBRICON, bang::cambricon);

--- a/src/infiniop/devices/cuda/cuda_common.cu
+++ b/src/infiniop/devices/cuda/cuda_common.cu
-#include "cuda_handle.cuh"
+#include "nvidia_handle.cuh"

-namespace device::cuda {
+namespace device::nvidia {

-Handle::Handle(infiniDevice_t device, int device_id)
-    : InfiniopHandle{device, device_id},
+Handle::Handle(int device_id)
+    : InfiniopHandle{INFINI_DEVICE_NVIDIA, device_id},
      _internal(std::make_shared<Handle::Internal>(device_id)) {}

 auto Handle::internal() const -> const std::shared_ptr<Internal> & {
@@ -83,16 +83,9 @@ cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
 }
 #endif

-namespace nvidia {
-
-Handle::Handle(int device_id)
-    : cuda::Handle(INFINI_DEVICE_NVIDIA, device_id) {}
-
 infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
    *handle_ptr = new Handle(device_id);
    return INFINI_STATUS_SUCCESS;
 }

-} // namespace nvidia
-
-} // namespace device::cuda
+} // namespace device::nvidia
--- a/src/infiniop/devices/cuda/cuda_common.cuh
+++ b/src/infiniop/devices/cuda/cuda_common.cuh
 #ifndef __INFINIOP_CUDA_COMMON_CUH__
 #define __INFINIOP_CUDA_COMMON_CUH__

-#include "cuda_handle.cuh"
 #include "infinicore.h"
+#include "nvidia_handle.cuh"

-namespace device::cuda {
+namespace device::nvidia {

 #ifdef ENABLE_CUDNN_API
 cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
 #endif

-} // namespace device::cuda
+} // namespace device::nvidia

 #endif // __INFINIOP_CUDA_COMMON_CUH__
--- a/src/infiniop/devices/cuda/cuda_handle.cuh
+++ b/src/infiniop/devices/cuda/cuda_handle.cuh
@@ -3,7 +3,7 @@

 #include "../../../utils.h"
 #include "../pool.h"
-#include "cuda_handle.h"
+#include "nvidia_handle.h"
 #include <cublas_v2.h>
 #include <functional>

@@ -14,7 +14,7 @@
 #define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS)
 #define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)

-namespace device::cuda {
+namespace device::nvidia {

 class Handle::Internal {
    Pool<cublasHandle_t> blas_handles;
@@ -48,6 +48,6 @@ public:
    int gridSizeZ() const;
 };

-} // namespace device::cuda
+} // namespace device::nvidia

 #endif // __INFINIOP_CUDA_HANDLE_CUH__
--- a/src/infiniop/devices/cuda/cuda_handle.h
+++ b/src/infiniop/devices/cuda/cuda_handle.h
@@ -4,30 +4,20 @@
 #include "../../handle.h"
 #include <memory>

-namespace device::cuda {
+namespace device::nvidia {

 struct Handle : public InfiniopHandle {
+    Handle(int device_id);
    class Internal;
    auto internal() const -> const std::shared_ptr<Internal> &;

-protected:
-    Handle(infiniDevice_t device, int device_id);
+public:
+    static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);

 private:
    std::shared_ptr<Internal> _internal;
 };

-namespace nvidia {
-
-class Handle : public cuda::Handle {
-    Handle(int device_id);
-
-public:
-    static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
-};
-
-} // namespace nvidia
-
-} // namespace device::cuda
+} // namespace device::nvidia

 #endif // __INFINIOP_CUDA_HANDLE_H__
--- a/src/infiniop/devices/cuda/cuda_kernel_common.cuh
+++ b/src/infiniop/devices/cuda/cuda_kernel_common.cuh
@@ -18,7 +18,7 @@
 using cuda_bfloat16 = nv_bfloat16;
 using cuda_bfloat162 = nv_bfloat162;

-namespace device::cuda {
+namespace device::nvidia {
 // return the memory offset of original tensor, given the flattened index of broadcasted tensor
 __forceinline__ __device__ __host__ size_t
 indexToReducedOffset(
@@ -48,7 +48,7 @@ indexToOffset(
    }
    return res;
 }
-} // namespace device::cuda
+} // namespace device::nvidia

 __forceinline__ __device__ float
 exp_(const float val) {

--- a/src/infiniop/elementwise/elementwise.h
+++ b/src/infiniop/elementwise/elementwise.h
@@ -12,45 +12,45 @@
 #include <numeric>
 #include <vector>

-#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, KERNEL_COMMON)                      \
-                                                                                  \
-    namespace op::OP::NAMESPACE {                                                 \
-    class Descriptor final : public InfiniopDescriptor {                          \
-        infiniDtype_t _dtype;                                                     \
-        op::elementwise::ElementwiseInfo _info;                                   \
-        std::unique_ptr<op::elementwise::KERNEL_COMMON::DeviceImpl> _device_info; \
-        size_t _workspace_size;                                                   \
-                                                                                  \
-        Descriptor(                                                               \
-            infiniDtype_t dtype,                                                  \
-            op::elementwise::ElementwiseInfo info,                                \
-            op::elementwise::KERNEL_COMMON::DeviceImpl *device_info,              \
-            size_t workspace_size,                                                \
-            infiniDevice_t device_type,                                           \
-            int device_id)                                                        \
-            : InfiniopDescriptor{device_type, device_id},                         \
-              _dtype(dtype),                                                      \
-              _info(std::move(info)),                                             \
-              _device_info(std::move(device_info)),                               \
-              _workspace_size(workspace_size) {}                                  \
-                                                                                  \
-    public:                                                                       \
-        ~Descriptor();                                                            \
-                                                                                  \
-        size_t workspaceSize() const { return _workspace_size; }                  \
-                                                                                  \
-        static infiniStatus_t create(                                             \
-            infiniopHandle_t handle,                                              \
-            Descriptor **desc_ptr,                                                \
-            infiniopTensorDescriptor_t output_desc,                               \
-            std::vector<infiniopTensorDescriptor_t> input_descs);                 \
-                                                                                  \
-        infiniStatus_t calculate(                                                 \
-            void *workspace, size_t workspace_size,                               \
-            void *output,                                                         \
-            std::vector<const void *> inputs,                                     \
-            void *stream) const;                                                  \
-    };                                                                            \
+#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE)                                 \
+                                                                              \
+    namespace op::OP::NAMESPACE {                                             \
+    class Descriptor final : public InfiniopDescriptor {                      \
+        infiniDtype_t _dtype;                                                 \
+        op::elementwise::ElementwiseInfo _info;                               \
+        std::unique_ptr<op::elementwise::NAMESPACE::DeviceImpl> _device_info; \
+        size_t _workspace_size;                                               \
+                                                                              \
+        Descriptor(                                                           \
+            infiniDtype_t dtype,                                              \
+            op::elementwise::ElementwiseInfo info,                            \
+            op::elementwise::NAMESPACE::DeviceImpl *device_info,              \
+            size_t workspace_size,                                            \
+            infiniDevice_t device_type,                                       \
+            int device_id)                                                    \
+            : InfiniopDescriptor{device_type, device_id},                     \
+              _dtype(dtype),                                                  \
+              _info(std::move(info)),                                         \
+              _device_info(std::move(device_info)),                           \
+              _workspace_size(workspace_size) {}                              \
+                                                                              \
+    public:                                                                   \
+        ~Descriptor();                                                        \
+                                                                              \
+        size_t workspaceSize() const { return _workspace_size; }              \
+                                                                              \
+        static infiniStatus_t create(                                         \
+            infiniopHandle_t handle,                                          \
+            Descriptor **desc_ptr,                                            \
+            infiniopTensorDescriptor_t output_desc,                           \
+            std::vector<infiniopTensorDescriptor_t> input_descs);             \
+                                                                              \
+        infiniStatus_t calculate(                                             \
+            void *workspace, size_t workspace_size,                           \
+            void *output,                                                     \
+            std::vector<const void *> inputs,                                 \
+            void *stream) const;                                              \
+    };                                                                        \
    }

 namespace op::elementwise {

--- a/src/infiniop/elementwise/cuda/elementwise_cuda.cuh
+++ b/src/infiniop/elementwise/cuda/elementwise_cuda.cuh
@@ -2,11 +2,11 @@
 #define __INFINIOP_ELEMENTWISE_CUDA_H__

 #include "../../../utils.h"
-#include "../../devices/cuda/cuda_common.cuh"
-#include "../../devices/cuda/cuda_kernel_common.cuh"
-#include "elementwise_cuda_api.cuh"
+#include "../../devices/nvidia/nvidia_common.cuh"
+#include "../../devices/nvidia/nvidia_kernel_common.cuh"
+#include "elementwise_nvidia_api.cuh"

-namespace op::elementwise::cuda {
+namespace op::elementwise::nvidia {

 /**
 * @brief Casts an untyped device pointer to a typed pointer of type T.
@@ -33,7 +33,7 @@ __device__ __forceinline__ const T *typedInputPtr(const void *ptr) {
 */
 __device__ __forceinline__ size_t getOutputIndex(size_t idx, bool is_contiguous, size_t ndim,
                                                 const size_t *shape, const ptrdiff_t *strides) {
-    return is_contiguous ? idx : device::cuda::indexToOffset(idx, ndim, shape, strides);
+    return is_contiguous ? idx : device::nvidia::indexToOffset(idx, ndim, shape, strides);
 }

 /**
@@ -61,8 +61,8 @@ struct InputIndexer {
        return input_contiguous[input_id]
                 ? idx
                 : (input_broadcasted[input_id]
-                        ? device::cuda::indexToReducedOffset(idx, ndim, output_strides, input_strides + input_id * ndim)
-                        : device::cuda::indexToOffset(idx, ndim, input_shapes + input_id * ndim, input_strides + input_id * ndim));
+                        ? device::nvidia::indexToReducedOffset(idx, ndim, output_strides, input_strides + input_id * ndim)
+                        : device::nvidia::indexToOffset(idx, ndim, input_shapes + input_id * ndim, input_strides + input_id * ndim));
    }
 };

@@ -186,9 +186,9 @@ INFINIOP_CUDA_KERNEL elementwiseKernel(
 }

 struct DeviceImpl::Opaque {
-    std::shared_ptr<device::cuda::Handle::Internal> internal;
+    std::shared_ptr<device::nvidia::Handle::Internal> internal;

-    Opaque(const std::shared_ptr<device::cuda::Handle::Internal> &internal)
+    Opaque(const std::shared_ptr<device::nvidia::Handle::Internal> &internal)
        : internal(internal) {}

    /**
@@ -414,6 +414,6 @@ infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &inf
        std::forward<Args>(args)...);
 }

-} // namespace op::elementwise::cuda
+} // namespace op::elementwise::nvidia

 #endif // __INFINIOP_ELEMENTWISE_CUDA_H__
--- a/src/infiniop/elementwise/cuda/elementwise_cuda_api.cuh
+++ b/src/infiniop/elementwise/cuda/elementwise_cuda_api.cuh
@@ -3,7 +3,7 @@

 #include "../elementwise.h"

-namespace op::elementwise::cuda {
+namespace op::elementwise::nvidia {

 /**
 * @brief Define the methods and info needed by CUDA to perform elementwise operation
@@ -77,7 +77,7 @@ public:
        void *stream,
        Args &&...args);
 };
-} // namespace op::elementwise::cuda
+} // namespace op::elementwise::nvidia

 /**
 * @brief Define the process for initializing a Descriptor of an elementwise operation
@@ -88,22 +88,22 @@ public:
 * @param OUT_DESC       The output tensor descriptor.
 * @param INPUT_DESC_VEC A vector containing input tensor descriptors.
 */
-#define CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC)          \
-                                                                                             \
-    auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC);   \
-    CHECK_RESULT(info_result);                                                               \
-    auto info = info_result.take();                                                          \
-    auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *);      \
-                                                                                             \
-    auto device_impl_result = op::elementwise::cuda::DeviceImpl::create(HANDLE->internal()); \
-    CHECK_RESULT(device_impl_result);                                                        \
-                                                                                             \
-    *desc_ptr = new Descriptor(                                                              \
-        DTYPE,                                                                               \
-        std::move(info),                                                                     \
-        std::move(device_impl_result.take()),                                                \
-        workspace_size,                                                                      \
-        HANDLE->device,                                                                      \
+#define CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC)            \
+                                                                                               \
+    auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC);     \
+    CHECK_RESULT(info_result);                                                                 \
+    auto info = info_result.take();                                                            \
+    auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *);        \
+                                                                                               \
+    auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(HANDLE->internal()); \
+    CHECK_RESULT(device_impl_result);                                                          \
+                                                                                               \
+    *desc_ptr = new Descriptor(                                                                \
+        DTYPE,                                                                                 \
+        std::move(info),                                                                       \
+        std::move(device_impl_result.take()),                                                  \
+        workspace_size,                                                                        \
+        HANDLE->device,                                                                        \
        HANDLE->device_id);

 #endif // __INFINIOP_ELEMENTWISE_CUDA_API_H__
--- a/src/infiniop/ops/add/cpu/add_cpu.h
+++ b/src/infiniop/ops/add/cpu/add_cpu.h
@@ -3,7 +3,7 @@

 #include "../../../elementwise/cpu/elementwise_cpu.h"

-ELEMENTWISE_DESCRIPTOR(add, cpu, cpu)
+ELEMENTWISE_DESCRIPTOR(add, cpu)

 namespace op::add::cpu {
 typedef struct AddOp {

--- a/src/infiniop/ops/add/metax/add_metax.h
+++ b/src/infiniop/ops/add/metax/add_metax.h
@@ -3,6 +3,6 @@

 #include "../../../elementwise/metax/elementwise_metax_api.h"

-ELEMENTWISE_DESCRIPTOR(add, metax, metax)
+ELEMENTWISE_DESCRIPTOR(add, metax)

 #endif // __ADD_METAX_API_H__
--- a/src/infiniop/ops/add/nvidia/add_nvidia.cu
+++ b/src/infiniop/ops/add/nvidia/add_nvidia.cu
-#include "../../../elementwise/cuda/elementwise_cuda.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"

 #include "../cuda/kernel.cuh"
 #include "add_nvidia.cuh"
@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create(
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

-    auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &a_desc = input_desc_vec.at(0);

--- a/src/infiniop/ops/add/nvidia/add_nvidia.cuh
+++ b/src/infiniop/ops/add/nvidia/add_nvidia.cuh
 #ifndef __ADD_CUDA_API_H__
 #define __ADD_CUDA_API_H__

-#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"

-ELEMENTWISE_DESCRIPTOR(add, nvidia, cuda)
+ELEMENTWISE_DESCRIPTOR(add, nvidia)

 #endif // __ADD_CUDA_API_H__
--- a/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
+++ b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
-#include "../../../devices/cuda/cuda_common.cuh"
+#include "../../../devices/nvidia/nvidia_common.cuh"
 #include "causal_softmax_nvidia.cuh"

-#include "../../../devices/cuda/cuda_kernel_common.cuh"
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
 #include <cub/block/block_reduce.cuh>

 #include "../../../reduce/cuda/reduce.cuh"
@@ -20,7 +20,7 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
 namespace op::causal_softmax::nvidia {

 struct Descriptor::Opaque {
-    std::shared_ptr<device::cuda::Handle::Internal> internal;
+    std::shared_ptr<device::nvidia::Handle::Internal> internal;
 };

 Descriptor::~Descriptor() {
@@ -35,7 +35,7 @@ infiniStatus_t Descriptor::create(
    auto info = CausalSoftmaxInfo::create(y_desc, x_desc);
    CHECK_RESULT(info);
    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::cuda::Handle *>(handle)->internal()},
+        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
        info.take(), 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
 }

--- a/src/infiniop/ops/clip/cpu/clip_cpu.h
+++ b/src/infiniop/ops/clip/cpu/clip_cpu.h
@@ -4,7 +4,7 @@
 #include "../../../elementwise/cpu/elementwise_cpu.h"
 #include "infiniop/ops/clip.h"

-ELEMENTWISE_DESCRIPTOR(clip, cpu, cpu)
+ELEMENTWISE_DESCRIPTOR(clip, cpu)

 namespace op::clip::cpu {


--- a/src/infiniop/ops/clip/metax/clip_metax.h
+++ b/src/infiniop/ops/clip/metax/clip_metax.h
@@ -3,6 +3,6 @@

 #include "../../../elementwise/metax/elementwise_metax_api.h"

-ELEMENTWISE_DESCRIPTOR(clip, metax, metax)
+ELEMENTWISE_DESCRIPTOR(clip, metax)

 #endif // __CLIP_METAX_API_H__
--- a/src/infiniop/ops/clip/nvidia/clip_nvidia.cu
+++ b/src/infiniop/ops/clip/nvidia/clip_nvidia.cu
-#include "../../../elementwise/cuda/elementwise_cuda.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"

 #include "../cuda/kernel.cuh"
 #include "clip_nvidia.cuh"
@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create(
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

-    auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &in_desc = input_desc_vec.at(0);

--- a/src/infiniop/ops/clip/nvidia/clip_nvidia.cuh
+++ b/src/infiniop/ops/clip/nvidia/clip_nvidia.cuh
 #ifndef __CLIP_CUDA_API_H__
 #define __CLIP_CUDA_API_H__

-#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"

-ELEMENTWISE_DESCRIPTOR(clip, nvidia, cuda)
+ELEMENTWISE_DESCRIPTOR(clip, nvidia)

 #endif // __CLIP_CUDA_API_H__
--- a/src/infiniop/ops/conv/info.h
+++ b/src/infiniop/ops/conv/info.h
@@ -6,7 +6,7 @@
 #include "../../tensor.h"

 #ifdef ENABLE_CUDA_API
-#include "../../devices/cuda/cuda_handle.cuh"
+#include "../../devices/nvidia/nvidia_handle.cuh"
 #endif

 namespace op::conv {

--- a/src/infiniop/ops/conv/cuda/conv_cuda.cu
+++ b/src/infiniop/ops/conv/cuda/conv_cuda.cu
-#include "../../../devices/cuda/cuda_common.cuh"
-#include "../../../devices/cuda/cuda_handle.cuh"
-#include "conv_cuda.cuh"
-
-#ifdef ENABLE_CUDNN_API
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "../../../devices/nvidia/nvidia_handle.cuh"
+#include "conv_nvidia.cuh"

 #define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \
    do {                                                 \
@@ -22,11 +20,13 @@
        DESTROY_CUDNN_DESCRIPTOR(conv_desc, cudnnDestroyConvolutionDescriptor); \
    } while (0)

-namespace op::conv::cuda {
+namespace op::conv::nvidia {

 struct Descriptor::Opaque {
-    std::shared_ptr<device::cuda::Handle::Internal> internal;
+    std::shared_ptr<device::nvidia::Handle::Internal> internal;
+    size_t workspace_size = 0;

+#ifdef ENABLE_CUDNN_API
    cudnnTensorDescriptor_t x_desc = nullptr;
    cudnnTensorDescriptor_t y_desc = nullptr;
    cudnnFilterDescriptor_t w_desc = nullptr;
@@ -34,12 +34,13 @@ struct Descriptor::Opaque {
    cudnnActivationDescriptor_t act_desc = nullptr;
    cudnnConvolutionDescriptor_t conv_desc = nullptr;
    cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-    size_t workspace_size = 0;
+#endif

 private:
-    Opaque(std::shared_ptr<device::cuda::Handle::Internal> internal_ptr)
+    Opaque(std::shared_ptr<device::nvidia::Handle::Internal> internal_ptr)
        : internal(internal_ptr) {}

+#ifdef ENABLE_CUDNN_API
    void initializeDimensionArrays(const ConvInfo &info,
                                   std::vector<int> &input_dims,
                                   std::vector<int> &output_dims,
@@ -108,11 +109,11 @@ private:
    infiniStatus_t getCudnnDataType(infiniDtype_t data_type,
                                    cudnnDataType_t &cudnn_data_type) const {
        if (data_type == INFINI_DTYPE_F16) {
-            cudnn_data_type = device::cuda::getCudnnDtype(data_type);
+            cudnn_data_type = device::nvidia::getCudnnDtype(data_type);
        } else if (data_type == INFINI_DTYPE_F32) {
-            cudnn_data_type = device::cuda::getCudnnDtype(data_type);
+            cudnn_data_type = device::nvidia::getCudnnDtype(data_type);
        } else if (data_type == INFINI_DTYPE_BF16) {
-            cudnn_data_type = device::cuda::getCudnnDtype(data_type);
+            cudnn_data_type = device::nvidia::getCudnnDtype(data_type);
        } else {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
@@ -253,32 +254,42 @@ private:

        return INFINI_STATUS_SUCCESS;
    }
+#endif

 public:
    Opaque(Opaque &&other) noexcept
        : internal(std::move(other.internal)),
-          x_desc(other.x_desc),
-          y_desc(other.y_desc),
-          w_desc(other.w_desc),
-          b_desc(other.b_desc),
-          act_desc(other.act_desc),
-          conv_desc(other.conv_desc),
-          algo(other.algo),
-          workspace_size(other.workspace_size) {
-
+          workspace_size(other.workspace_size)
+    // clang-format off
+#ifdef ENABLE_CUDNN_API
+          , x_desc(other.x_desc),
+          , y_desc(other.y_desc),
+          , w_desc(other.w_desc),
+          , b_desc(other.b_desc),
+          , act_desc(other.act_desc),
+          , conv_desc(other.conv_desc),
+          , algo(other.algo)
+#endif
+    // clang-format on
+    {
+#ifdef ENABLE_CUDNN_API
        other.x_desc = nullptr;
        other.y_desc = nullptr;
        other.w_desc = nullptr;
        other.b_desc = nullptr;
        other.act_desc = nullptr;
        other.conv_desc = nullptr;
+#endif
        other.workspace_size = 0;
    }

    ~Opaque() {
+#ifdef ENABLE_CUDNN_API
        CLEANUP_CUDNN_DESCRIPTORS();
+#endif
    }

+#ifdef ENABLE_CUDNN_API
    infiniStatus_t initializeCudnnContext(ConvInfo &info,
                                          infiniDtype_t data_type,
                                          cudnnDataType_t compute_type) {
@@ -320,17 +331,22 @@ public:

        return INFINI_STATUS_SUCCESS;
    }
+#endif

    static inline utils::Result<Opaque> create(
-        std::shared_ptr<device::cuda::Handle::Internal> internal_ptr,
+        std::shared_ptr<device::nvidia::Handle::Internal> internal_ptr,
        ConvInfo &info,
        infiniDtype_t data_type) {
+#ifdef ENABLE_CUDNN_API
        Opaque opaque(internal_ptr);
        auto status = opaque.initializeCudnnContext(info, data_type, CUDNN_DATA_FLOAT);
        if (status != INFINI_STATUS_SUCCESS) {
            return status;
        }
        return utils::Result<Opaque>(std::move(opaque));
+#else
+        return INFINI_STATUS_SUCCESS;
+#endif
    }
 };

@@ -351,7 +367,8 @@ infiniStatus_t Descriptor::create(
    const void *strides,
    const void *dilations,
    size_t n) {
-    auto handle = reinterpret_cast<device::cuda::nvidia::Handle *>(handle_);
+#ifdef ENABLE_CUDNN_API
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto dtype = y_desc->dtype();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
@@ -373,6 +390,9 @@ infiniStatus_t Descriptor::create(
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
+#else
+    return INFINI_STATUS_SUCCESS;
+#endif
 }

 infiniStatus_t Descriptor::calculate(
@@ -383,6 +403,7 @@ infiniStatus_t Descriptor::calculate(
    const void *w,
    const void *bias,
    void *stream) const {
+#ifdef ENABLE_CUDNN_API
    const float alpha = 1.0f, beta = 0.0f;
    if (bias != nullptr) {
        CHECK_STATUS(_opaque->internal->useCudnn(
@@ -428,7 +449,8 @@ infiniStatus_t Descriptor::calculate(
    }

    return INFINI_STATUS_SUCCESS;
+#else
+    return INFINI_STATUS_SUCCESS;
+#endif
 }
-} // namespace op::conv::cuda
-
-#endif // ENABLE_CUDNN_API
+} // namespace op::conv::nvidia