Unverified Commit 0166515c authored by PanZezhong1725, committed by GitHub

Merge branch 'main' into issue/300

parents f0300ff3 a23c4d13
......@@ -6,6 +6,8 @@
inline infiniDtype_t ggmlTypeToInfiniType(GGML_TYPE type) {
switch (type) {
case GGML_TYPE_Q8_K:
return INFINI_DTYPE_BOOL;
case GGML_TYPE_I8:
return INFINI_DTYPE_I8;
case GGML_TYPE_I16:
......@@ -14,10 +16,10 @@ inline infiniDtype_t ggmlTypeToInfiniType(GGML_TYPE type) {
return INFINI_DTYPE_I32;
case GGML_TYPE_I64:
return INFINI_DTYPE_I64;
case GGML_TYPE_F16:
return INFINI_DTYPE_F16;
case GGML_TYPE_BF16:
return INFINI_DTYPE_BF16;
case GGML_TYPE_F16:
return INFINI_DTYPE_F16;
case GGML_TYPE_F32:
return INFINI_DTYPE_F32;
case GGML_TYPE_F64:
......
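Note on the hunk above: GGML_TYPE_Q8_K is mapped to INFINI_DTYPE_BOOL, which looks like a 1-byte placeholder rather than a faithful representation of the Q8_K quantized block format. A minimal spot-check of the mapping, assuming the function and both enums are visible from this header (checkGgmlMapping is our name, not from the repo), could be:

#include <cassert>

void checkGgmlMapping() {
    assert(ggmlTypeToInfiniType(GGML_TYPE_BF16) == INFINI_DTYPE_BF16);
    // Q8_K -> BOOL appears to be a stand-in: Q8_K is a quantized block
    // format, not a 1-byte boolean.
    assert(ggmlTypeToInfiniType(GGML_TYPE_Q8_K) == INFINI_DTYPE_BOOL);
}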
......@@ -9,12 +9,16 @@
inline double getVal(void *ptr, GGML_TYPE ggml_type) {
switch (ggml_type) {
case GGML_TYPE_BF16:
return utils::cast<float>(*(bf16_t *)ptr);
case GGML_TYPE_F16:
return utils::cast<double>(*(fp16_t *)ptr);
return utils::cast<float>(*(fp16_t *)ptr);
case GGML_TYPE_F32:
return *(float *)ptr;
case GGML_TYPE_F64:
return *(double *)ptr;
case GGML_TYPE_Q8_K:
return *(bool *)ptr;
case GGML_TYPE_I8:
return *(int8_t *)ptr;
case GGML_TYPE_I16:
......@@ -30,12 +34,16 @@ inline double getVal(void *ptr, GGML_TYPE ggml_type) {
inline size_t ggmlSizeOf(GGML_TYPE ggml_type) {
switch (ggml_type) {
case GGML_TYPE_BF16:
return sizeof(bf16_t);
case GGML_TYPE_F16:
return sizeof(fp16_t);
case GGML_TYPE_F32:
return sizeof(float);
case GGML_TYPE_F64:
return sizeof(double);
case GGML_TYPE_Q8_K:
return sizeof(bool);
case GGML_TYPE_I8:
return sizeof(int8_t);
case GGML_TYPE_I16:
......
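getVal widens one element to double and ggmlSizeOf reports the element size in bytes, so together they support type-erased iteration over a flat buffer. A sketch, assuming this header is included (getElementAt is our name):

// Hypothetical helper combining the two functions above: read element i of a
// contiguous buffer whose element type is described by a GGML_TYPE tag.
#include <cstddef>

inline double getElementAt(void *base, size_t i, GGML_TYPE t) {
    char *p = static_cast<char *>(base) + i * ggmlSizeOf(t);
    return getVal(p, t);
}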
#include "tensor.hpp"
#include "gguf.hpp"
#include "utils.hpp"
#include <cstring>
#include <infinirt.h>
......@@ -19,6 +20,40 @@ void printData(const T *data, const std::vector<size_t> &shape, const std::vecto
}
}
// int8_t is an alias for signed char (range -128 to 127), so operator<< streams
// it as a character, possibly non-printable; cast to int to print it numerically.
template <>
void printData(const int8_t *data, const std::vector<size_t> &shape,
const std::vector<ptrdiff_t> &strides, size_t dim) {
if (dim == shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) {
std::cout << static_cast<int>(*(data + i * strides[dim])) << " ";
}
std::cout << std::endl;
} else if (dim < shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) {
printData(data + i * strides[dim], shape, strides, dim + 1);
std::cout << std::endl;
}
}
}
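The specialization above exists because operator<< treats int8_t as a character type, so the static_cast<int> is what forces numeric output. A self-contained illustration:

// Why the cast matters (illustrative only):
#include <cstdint>
#include <iostream>

void demoInt8Printing() {
    int8_t v = 65;
    std::cout << v << "\n";                   // streamed as a char: prints 'A'
    std::cout << static_cast<int>(v) << "\n"; // prints 65
}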
template <>
void printData(const bf16_t *data, const std::vector<size_t> &shape,
const std::vector<ptrdiff_t> &strides, size_t dim) {
if (dim == shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) {
std::cout << utils::cast<float>(*(data + i * strides[dim])) << " ";
}
std::cout << std::endl;
} else if (dim < shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) {
printData(data + i * strides[dim], shape, strides, dim + 1);
std::cout << std::endl;
}
}
}
template <>
void printData(const fp16_t *data, const std::vector<size_t> &shape,
const std::vector<ptrdiff_t> &strides, size_t dim) {
......@@ -26,6 +61,7 @@ void printData(const fp16_t *data, const std::vector<size_t> &shape,
for (size_t i = 0; i < shape[dim]; i++) {
std::cout << utils::cast<float>(*(data + i * strides[dim])) << " ";
}
std::cout << std::endl;
} else if (dim < shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) {
printData(data + i * strides[dim], shape, strides, dim + 1);
......@@ -227,6 +263,8 @@ void Tensor::debug() const {
auto tensor = to(INFINI_DEVICE_CPU, 0);
std::cout << "Tensor: " << tensor->info() << std::endl;
switch (_ggml_type) {
case GGML_TYPE_BF16:
printData((bf16_t *)(tensor->data()), _shape, _strides, 0);
break;
case GGML_TYPE_F16:
printData((fp16_t *)(tensor->data()), _shape, _strides, 0);
break;
......@@ -236,6 +274,9 @@ void Tensor::debug() const {
case GGML_TYPE_F64:
printData((double *)(tensor->data()), _shape, _strides, 0);
break;
case GGML_TYPE_Q8_K:
printData((bool *)(tensor->data()), _shape, _strides, 0);
break;
case GGML_TYPE_I8:
printData((int8_t *)(tensor->data()), _shape, _strides, 0);
break;
......@@ -245,6 +286,9 @@ void Tensor::debug() const {
case GGML_TYPE_I32:
printData((int32_t *)(tensor->data()), _shape, _strides, 0);
break;
case GGML_TYPE_I64:
printData((int64_t *)(tensor->data()), _shape, _strides, 0);
break;
default:
std::cout << "Unsupported GGML type" << std::endl;
break;
......
......@@ -5,8 +5,8 @@
#ifdef ENABLE_CPU_API
#include "cpu/cpu_handle.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/cuda_handle.h"
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/nvidia_handle.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/bang_handle.h"
......@@ -21,7 +21,7 @@
#include "kunlun/kunlun_handle.h"
#endif
#ifdef ENABLE_METAX_API
#include "maca/maca_handle.h"
#include "metax/metax_handle.h"
#endif
__C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
......@@ -41,8 +41,11 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, iluvatar);
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, bang::cambricon);
......@@ -57,7 +60,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca);
CREATE(INFINI_DEVICE_METAX, metax);
#endif
default:
......@@ -78,8 +81,11 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, iluvatar);
#endif
#ifdef ENABLE_CAMBRICON_API
DELETE(INFINI_DEVICE_CAMBRICON, bang::cambricon);
......@@ -94,7 +100,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, maca);
DELETE(INFINI_DEVICE_METAX, metax);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
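The CREATE and DELETE macro definitions sit outside these hunks. Judging from the call sites and from the Handle::create(InfiniopHandle **, int) signatures later in this diff, a plausible expansion is the following; this is an inference, not the repository's verbatim macro:

// Inferred from usage; the actual definitions are not shown in this diff:
#define CREATE(CASE, NAMESPACE) \
    case CASE:                  \
        return device::NAMESPACE::Handle::create(handle_ptr, device_id)

#define DELETE(CASE, NAMESPACE)                                       \
    case CASE:                                                        \
        delete reinterpret_cast<device::NAMESPACE::Handle *>(handle); \
        return INFINI_STATUS_SUCCESS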
#include "../../../utils.h"
#include "../pool.h"
#include "maca_handle.h"
#include "metax_handle.h"
#include <hcblas/hcblas.h>
#include <hcdnn/hcdnn.h>
#include <memory>
......@@ -8,7 +8,7 @@
#define CHECK_MCBLAS(API) CHECK_INTERNAL(API, HCBLAS_STATUS_SUCCESS)
#define CHECK_MCDNN(API) CHECK_INTERNAL(API, HCDNN_STATUS_SUCCESS)
namespace device::maca {
namespace device::metax {
class Handle::Internal {
Pool<hcblasHandle_t> mcblas_handles;
......@@ -39,4 +39,4 @@ public:
hcdnnDataType_t getHcdnnDtype(infiniDtype_t dt);
} // namespace device::maca
} // namespace device::metax
#include "common_maca.h"
#include "metax_common.h"
namespace device::maca {
namespace device::metax {
Handle::Handle(infiniDevice_t device, int device_id)
: InfiniopHandle{device, device_id},
_internal(std::make_shared<Handle::Internal>(device_id)) {}
......@@ -83,4 +83,4 @@ infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
return INFINI_STATUS_SUCCESS;
}
} // namespace device::maca
} // namespace device::metax
#ifndef __INFINIOP_MACA_HANDLE_H__
#define __INFINIOP_MACA_HANDLE_H__
#ifndef __INFINIOP_METAX_HANDLE_H__
#define __INFINIOP_METAX_HANDLE_H__
#include "../../handle.h"
#include <memory>
namespace device::maca {
namespace device::metax {
struct Handle : public InfiniopHandle {
Handle(int device_id);
class Internal;
......@@ -20,6 +20,6 @@ private:
std::shared_ptr<Internal> _internal;
};
} // namespace device::maca
} // namespace device::metax
#endif // __INFINIOP_MACA_HANDLE_H__
#endif // __INFINIOP_METAX_HANDLE_H__
#define INFINIOP_MACA_KERNEL __global__ void
// Possible maximum number of threads per block for MACA architectures
#define INFINIOP_METAX_KERNEL __global__ void
// Possible maximum number of threads per block for METAX architectures
// Used for picking correct kernel launch configuration
#define MACA_BLOCK_SIZE_1024 1024
#define MACA_BLOCK_SIZE_512 512
#define METAX_BLOCK_SIZE_1024 1024
#define METAX_BLOCK_SIZE_512 512
#define CHECK_METAX(API) CHECK_INTERNAL(API, hcSuccess)
#define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess)
using cuda_bfloat16 = hpcc_bfloat16;
using cuda_bfloat162 = hpcc_bfloat162;
namespace device::maca {
namespace device::metax {
// return the memory offset of the original tensor, given the flattened index of the broadcasted tensor
__forceinline__ __device__ __host__ size_t
......@@ -37,10 +41,8 @@ indexToOffset(
}
return res;
}
} // namespace device::maca
} // namespace device::metax
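The body of indexToOffset is partially elided above. Its assumed semantics (decompose the flat index against the tensor's shape, then weight each coordinate by its stride) can be restated host-side as:

// Host-side restatement of the assumed semantics of indexToOffset:
#include <cstddef>

size_t indexToOffsetRef(size_t flat, size_t ndim,
                        const size_t *shape, const ptrdiff_t *strides) {
    size_t res = 0;
    for (size_t d = ndim; d-- > 0;) { // innermost dimension varies fastest
        res += (flat % shape[d]) * strides[d];
        flat /= shape[d];
    }
    return res;
}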
#ifdef ENABLE_MACA_API
#include <maca_fp16.h>
__forceinline__ __device__ float
exp_(const float val) {
return expf(val);
......@@ -48,7 +50,7 @@ exp_(const float val) {
__forceinline__ __device__ long double
exp_(const long double val) {
return expl(val);
return exp(val);
}
__forceinline__ __device__ double
......@@ -60,4 +62,8 @@ __forceinline__ __device__ __half
exp_(const __half x) {
return hexp(x);
}
#endif
__forceinline__ __device__ __hpcc_bfloat16
exp_(const __hpcc_bfloat16 x) {
return hexp(x);
}
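The exp_ overload set (float, double, long double, __half, and now __hpcc_bfloat16) lets templated kernels use one spelling and still hit the per-type intrinsic. A sketch of such a kernel, using the macro defined earlier in this file (expKernel is our name):

// Illustrative templated kernel over the exp_ overload set:
template <typename T>
INFINIOP_METAX_KERNEL expKernel(T *out, const T *in, size_t n) {
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = exp_(in[i]); // resolves to expf/exp/hexp by overload
    }
}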
#include "cuda_handle.cuh"
#include "nvidia_handle.cuh"
namespace device::cuda {
namespace device {
namespace nvidia {
Handle::Handle(infiniDevice_t device, int device_id)
: InfiniopHandle{device, device_id},
......@@ -34,6 +36,7 @@ infiniStatus_t Handle::Internal::useCublas(cudaStream_t stream, const Fn<cublasH
return INFINI_STATUS_SUCCESS;
}
#ifdef ENABLE_CUDNN_API
infiniStatus_t Handle::Internal::useCudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const {
auto handle = dnn_handles.pop();
if (!handle) {
......@@ -44,6 +47,7 @@ infiniStatus_t Handle::Internal::useCudnn(cudaStream_t stream, const Fn<cudnnHan
dnn_handles.push(std::move(*handle));
return INFINI_STATUS_SUCCESS;
}
#endif
int Handle::Internal::warpSize() const { return _warp_size; }
int Handle::Internal::maxThreadsPerBlock() const { return _max_threads_per_block; }
......@@ -54,6 +58,7 @@ int Handle::Internal::gridSizeX() const { return _grid_size[0]; }
int Handle::Internal::gridSizeY() const { return _grid_size[1]; }
int Handle::Internal::gridSizeZ() const { return _grid_size[2]; }
#ifdef ENABLE_CUDNN_API
cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
switch (dt) {
case INFINI_DTYPE_F16:
......@@ -68,7 +73,7 @@ cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
return CUDNN_DATA_INT8;
case INFINI_DTYPE_I32:
return CUDNN_DATA_INT32;
#ifndef ENABLE_ILUVATAR_CUDA_API
#ifndef ENABLE_ILUVATAR_API
case INFINI_DTYPE_I64:
return CUDNN_DATA_INT64;
#endif
......@@ -78,17 +83,25 @@ cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
return CUDNN_DATA_FLOAT;
}
}
#endif
namespace nvidia {
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
*handle_ptr = new Handle(INFINI_DEVICE_NVIDIA, device_id);
return INFINI_STATUS_SUCCESS;
}
} // namespace nvidia
namespace iluvatar {
Handle::Handle(int device_id)
: cuda::Handle(INFINI_DEVICE_NVIDIA, device_id) {}
: nvidia::Handle(INFINI_DEVICE_ILUVATAR, device_id) {}
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
*handle_ptr = new Handle(device_id);
return INFINI_STATUS_SUCCESS;
}
} // namespace nvidia
} // namespace iluvatar
} // namespace device::cuda
} // namespace device
#ifndef __INFINIOP_CUDA_COMMON_CUH__
#define __INFINIOP_CUDA_COMMON_CUH__
#include "cuda_handle.cuh"
#include "infinicore.h"
#include "nvidia_handle.cuh"
namespace device::cuda {
namespace device::nvidia {
#ifdef ENABLE_CUDNN_API
cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
#endif
} // namespace device::cuda
} // namespace device::nvidia
#endif // __INFINIOP_CUDA_COMMON_CUH__
......@@ -3,19 +3,24 @@
#include "../../../utils.h"
#include "../pool.h"
#include "cuda_handle.h"
#include "nvidia_handle.h"
#include <cublas_v2.h>
#include <cudnn.h>
#include <functional>
#ifdef ENABLE_CUDNN_API
#include <cudnn.h>
#endif
#define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS)
#define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)
namespace device::cuda {
namespace device::nvidia {
class Handle::Internal {
Pool<cublasHandle_t> blas_handles;
#ifdef ENABLE_CUDNN_API
Pool<cudnnHandle_t> dnn_handles;
#endif
int _warp_size,
_max_threads_per_block,
......@@ -29,7 +34,9 @@ public:
Internal(int);
infiniStatus_t useCublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const;
#ifdef ENABLE_CUDNN_API
infiniStatus_t useCudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const;
#endif
int warpSize() const;
int maxThreadsPerBlock() const;
......@@ -41,6 +48,6 @@ public:
int gridSizeZ() const;
};
} // namespace device::cuda
} // namespace device::nvidia
#endif // __INFINIOP_CUDA_HANDLE_CUH__
......@@ -4,7 +4,9 @@
#include "../../handle.h"
#include <memory>
namespace device::cuda {
namespace device {
namespace nvidia {
struct Handle : public InfiniopHandle {
class Internal;
......@@ -13,21 +15,26 @@ struct Handle : public InfiniopHandle {
protected:
Handle(infiniDevice_t device, int device_id);
public:
static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
private:
std::shared_ptr<Internal> _internal;
};
namespace nvidia {
} // namespace nvidia
class Handle : public cuda::Handle {
namespace iluvatar {
struct Handle : public nvidia::Handle {
Handle(int device_id);
public:
static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
};
} // namespace nvidia
} // namespace iluvatar
} // namespace device::cuda
} // namespace device
#endif // __INFINIOP_CUDA_HANDLE_H__
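With iluvatar::Handle now deriving from nvidia::Handle, the Iluvatar backend reuses the NVIDIA implementation while reporting its own device enum. A usage sketch, with the flow assumed from the create signatures above:

// Sketch of handle creation through the per-device factory:
void makeIluvatarHandle() {
    InfiniopHandle *h = nullptr;
    if (device::iluvatar::Handle::create(&h, /*device_id=*/0) == INFINI_STATUS_SUCCESS) {
        // h runs the shared NVIDIA code path but carries INFINI_DEVICE_ILUVATAR
    }
}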
......@@ -4,6 +4,9 @@
#define INFINIOP_CUDA_KERNEL __global__ void
#endif
#include <cuda_bf16.h>
#include <cuda_fp16.h>
// Possible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration
#define CUDA_BLOCK_SIZE_4096 4096
......@@ -12,8 +15,10 @@
#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
namespace device::cuda {
using cuda_bfloat16 = nv_bfloat16;
using cuda_bfloat162 = nv_bfloat162;
namespace device::nvidia {
// return the memory offset of the original tensor, given the flattened index of the broadcasted tensor
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
......@@ -43,16 +48,14 @@ indexToOffset(
}
return res;
}
} // namespace device::cuda
} // namespace device::nvidia
#ifdef ENABLE_CUDA_API
#include <cuda_fp16.h>
__forceinline__ __device__ float
exp_(const float val) {
return expf(val);
}
#ifndef ENABLE_ILUVATAR_CUDA_API
#ifndef ENABLE_ILUVATAR_API
__forceinline__ __device__ long double
exp_(const long double val) {
return expl(val);
......@@ -73,4 +76,3 @@ __forceinline__ __device__ __nv_bfloat16
exp_(const __nv_bfloat16 x) {
return hexp(x);
}
#endif
#ifndef __INFINIOP_ELEMENTWISE_MACA_H__
#define __INFINIOP_ELEMENTWISE_MACA_H__
#ifndef __INFINIOP_ELEMENTWISE_METAX_H__
#define __INFINIOP_ELEMENTWISE_METAX_H__
#include "../../../utils.h"
#include "../../devices/maca/common_maca.h"
#include "../../devices/maca/maca_kernel_common.h"
#include "elementwise_maca_api.h"
#include "../../devices/metax/metax_common.h"
#include "../../devices/metax/metax_kernel_common.h"
#include "elementwise_metax_api.h"
namespace op::elementwise::maca {
namespace op::elementwise::metax {
template <typename T>
__device__ __forceinline__ const T *typedInputPtr(const void *ptr) {
return reinterpret_cast<const T *>(ptr);
......@@ -14,7 +14,7 @@ __device__ __forceinline__ const T *typedInputPtr(const void *ptr) {
__device__ __forceinline__ size_t getOutputIndex(size_t idx, bool is_contiguous, size_t ndim,
const size_t *shape, const ptrdiff_t *strides) {
return is_contiguous ? idx : device::maca::indexToOffset(idx, ndim, shape, strides);
return is_contiguous ? idx : device::metax::indexToOffset(idx, ndim, shape, strides);
}
struct InputIndexer {
......@@ -30,8 +30,8 @@ struct InputIndexer {
return input_contiguous[input_id]
? idx
: (input_broadcasted[input_id]
? device::maca::indexToReducedOffset(idx, ndim, output_strides, input_strides + input_id * ndim)
: device::maca::indexToOffset(idx, ndim, input_shapes + input_id * ndim, input_strides + input_id * ndim));
? device::metax::indexToReducedOffset(idx, ndim, output_strides, input_strides + input_id * ndim)
: device::metax::indexToOffset(idx, ndim, input_shapes + input_id * ndim, input_strides + input_id * ndim));
}
};
......@@ -41,7 +41,7 @@ __device__ __forceinline__ void unpackInputsAndApply(F &&f, std::index_sequence<
}
template <size_t N, typename Op, typename Tdata, typename... Args>
INFINIOP_MACA_KERNEL elementwiseKernel(
INFINIOP_METAX_KERNEL elementwiseKernel(
size_t output_size,
size_t ndim,
bool output_contiguous,
......@@ -72,7 +72,7 @@ INFINIOP_MACA_KERNEL elementwiseKernel(
}
template <typename Op, typename Tout, typename... Tin>
INFINIOP_MACA_KERNEL elementwiseKernel(
INFINIOP_METAX_KERNEL elementwiseKernel(
size_t output_size,
size_t ndim,
bool output_contiguous,
......@@ -102,9 +102,9 @@ INFINIOP_MACA_KERNEL elementwiseKernel(
}
struct DeviceImpl::Opaque {
std::shared_ptr<device::maca::Handle::Internal> internal;
std::shared_ptr<device::metax::Handle::Internal> internal;
Opaque(const std::shared_ptr<device::maca::Handle::Internal> &internal)
Opaque(const std::shared_ptr<device::metax::Handle::Internal> &internal)
: internal(internal) {}
template <uint32_t BLOCK_SIZE, size_t N, typename Op, typename Tdata, typename... Args>
......@@ -159,8 +159,8 @@ private:
const int8_t *d_meta_start = reinterpret_cast<int8_t *>(workspace) + input_arr_size;
// copy the input pointer array and meta to device
CHECK_MACA(hcMemcpyAsync(workspace, h_inputs_arr, input_arr_size, hcMemcpyHostToDevice, stream));
CHECK_MACA(hcMemcpyAsync((void *)d_meta_start, info_meta_start, info.getMetaMemSize(), hcMemcpyHostToDevice, stream));
CHECK_METAX(hcMemcpyAsync(workspace, h_inputs_arr, input_arr_size, hcMemcpyHostToDevice, stream));
CHECK_METAX(hcMemcpyAsync((void *)d_meta_start, info_meta_start, info.getMetaMemSize(), hcMemcpyHostToDevice, stream));
// offset/assign the pointers
d_inputs_arr = reinterpret_cast<const void **>(workspace);
......@@ -259,6 +259,6 @@ infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &inf
std::forward<Args>(args)...);
}
} // namespace op::elementwise::maca
} // namespace op::elementwise::metax
#endif
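The two hcMemcpyAsync calls earlier in this file imply a single workspace that packs the device copy of the input-pointer array first, with the kernel metadata immediately after it. The partitioning, reconstructed from the pointer arithmetic in that hunk:

// Sketch of the workspace partitioning implied by the copies above:
#include <cstddef>
#include <cstdint>

void partitionWorkspace(void *workspace, size_t input_count,
                        const void ***d_inputs_arr, const int8_t **d_meta_start) {
    size_t input_arr_size = input_count * sizeof(void *);
    *d_inputs_arr = reinterpret_cast<const void **>(workspace);
    *d_meta_start = reinterpret_cast<int8_t *>(workspace) + input_arr_size;
}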
#ifndef __INFINIOP_ELEMENTWISE_MACA_API_H__
#define __INFINIOP_ELEMENTWISE_MACA_API_H__
#ifndef __INFINIOP_ELEMENTWISE_METAX_API_H__
#define __INFINIOP_ELEMENTWISE_METAX_API_H__
#include "../elementwise.h"
namespace op::elementwise::maca {
namespace op::elementwise::metax {
class DeviceImpl final {
struct Opaque;
......@@ -37,23 +37,23 @@ public:
void *stream,
Args &&...args);
};
} // namespace op::elementwise::maca
#define CREATE_ELEMENTWISE_MACA_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC) \
\
auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC); \
CHECK_RESULT(info_result); \
auto info = info_result.take(); \
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); \
\
auto device_impl_result = op::elementwise::maca::DeviceImpl::create(HANDLE->internal()); \
CHECK_RESULT(device_impl_result); \
\
*desc_ptr = new Descriptor( \
DTYPE, \
std::move(info), \
std::move(device_impl_result.take()), \
workspace_size, \
HANDLE->device, \
} // namespace op::elementwise::metax
#define CREATE_ELEMENTWISE_METAX_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC) \
\
auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC); \
CHECK_RESULT(info_result); \
auto info = info_result.take(); \
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); \
\
auto device_impl_result = op::elementwise::metax::DeviceImpl::create(HANDLE->internal()); \
CHECK_RESULT(device_impl_result); \
\
*desc_ptr = new Descriptor( \
DTYPE, \
std::move(info), \
std::move(device_impl_result.take()), \
workspace_size, \
HANDLE->device, \
HANDLE->device_id);
#endif // __INFINIOP_ELEMENTWISE_MACA_API_H__
#endif // __INFINIOP_ELEMENTWISE_METAX_API_H__
......@@ -2,11 +2,11 @@
#define __INFINIOP_ELEMENTWISE_CUDA_H__
#include "../../../utils.h"
#include "../../devices/cuda/cuda_common.cuh"
#include "../../devices/cuda/cuda_kernel_common.cuh"
#include "elementwise_cuda_api.cuh"
#include "../../devices/nvidia/nvidia_common.cuh"
#include "../../devices/nvidia/nvidia_kernel_common.cuh"
#include "elementwise_nvidia_api.cuh"
namespace op::elementwise::cuda {
namespace op::elementwise::nvidia {
/**
* @brief Casts an untyped device pointer to a typed pointer of type T.
......@@ -33,7 +33,7 @@ __device__ __forceinline__ const T *typedInputPtr(const void *ptr) {
*/
__device__ __forceinline__ size_t getOutputIndex(size_t idx, bool is_contiguous, size_t ndim,
const size_t *shape, const ptrdiff_t *strides) {
return is_contiguous ? idx : device::cuda::indexToOffset(idx, ndim, shape, strides);
return is_contiguous ? idx : device::nvidia::indexToOffset(idx, ndim, shape, strides);
}
/**
......@@ -61,8 +61,8 @@ struct InputIndexer {
return input_contiguous[input_id]
? idx
: (input_broadcasted[input_id]
? device::cuda::indexToReducedOffset(idx, ndim, output_strides, input_strides + input_id * ndim)
: device::cuda::indexToOffset(idx, ndim, input_shapes + input_id * ndim, input_strides + input_id * ndim));
? device::nvidia::indexToReducedOffset(idx, ndim, output_strides, input_strides + input_id * ndim)
: device::nvidia::indexToOffset(idx, ndim, input_shapes + input_id * ndim, input_strides + input_id * ndim));
}
};
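InputIndexer picks one of three offset paths per input: contiguous inputs use the flat index directly, broadcast inputs go through indexToReducedOffset, and merely strided inputs through indexToOffset. The reduced-offset mapping's assumed semantics, restated host-side (valid when the output strides are positive and contiguous-descending):

// Host-side restatement of the assumed semantics of indexToReducedOffset:
// recover the multi-dimensional index from the output's strides, then
// re-weight with the input's strides (broadcast dims carry stride 0).
#include <cstddef>

size_t indexToReducedOffsetRef(size_t flat, size_t ndim,
                               const ptrdiff_t *out_strides,
                               const ptrdiff_t *in_strides) {
    size_t res = 0;
    for (size_t d = 0; d < ndim; ++d) {
        res += flat / out_strides[d] * in_strides[d];
        flat %= out_strides[d];
    }
    return res;
}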
......@@ -186,9 +186,9 @@ INFINIOP_CUDA_KERNEL elementwiseKernel(
}
struct DeviceImpl::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal;
std::shared_ptr<device::nvidia::Handle::Internal> internal;
Opaque(const std::shared_ptr<device::cuda::Handle::Internal> &internal)
Opaque(const std::shared_ptr<device::nvidia::Handle::Internal> &internal)
: internal(internal) {}
/**
......@@ -414,6 +414,6 @@ infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &inf
std::forward<Args>(args)...);
}
} // namespace op::elementwise::cuda
} // namespace op::elementwise::nvidia
#endif // __INFINIOP_ELEMENTWISE_CUDA_H__
......@@ -3,7 +3,7 @@
#include "../elementwise.h"
namespace op::elementwise::cuda {
namespace op::elementwise::nvidia {
/**
* @brief Define the methods and info needed by CUDA to perform an elementwise operation
......@@ -77,7 +77,7 @@ public:
void *stream,
Args &&...args);
};
} // namespace op::elementwise::cuda
} // namespace op::elementwise::nvidia
/**
* @brief Define the process for initializing a Descriptor of an elementwise operation
......@@ -88,22 +88,22 @@ public:
* @param OUT_DESC The output tensor descriptor.
* @param INPUT_DESC_VEC A vector containing input tensor descriptors.
*/
#define CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC) \
\
auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC); \
CHECK_RESULT(info_result); \
auto info = info_result.take(); \
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); \
\
auto device_impl_result = op::elementwise::cuda::DeviceImpl::create(HANDLE->internal()); \
CHECK_RESULT(device_impl_result); \
\
*desc_ptr = new Descriptor( \
DTYPE, \
std::move(info), \
std::move(device_impl_result.take()), \
workspace_size, \
HANDLE->device, \
#define CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC) \
\
auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC); \
CHECK_RESULT(info_result); \
auto info = info_result.take(); \
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); \
\
auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(HANDLE->internal()); \
CHECK_RESULT(device_impl_result); \
\
*desc_ptr = new Descriptor( \
DTYPE, \
std::move(info), \
std::move(device_impl_result.take()), \
workspace_size, \
HANDLE->device, \
HANDLE->device_id);
#endif // __INFINIOP_ELEMENTWISE_CUDA_API_H__
import functools
import inspect
import itertools
import pathlib
import ninetoothed
from ninetoothed.aot import _HEADER_PATH
CURRENT_FILE_PATH = pathlib.Path(__file__)
BUILD_DIRECTORY_PATH = (
CURRENT_FILE_PATH.parent.parent.parent.parent / "build" / "ninetoothed"
)
def build(premake, constexpr_param_grid, caller, op_name, output_dir):
headers = []
all_param_names = []
launches = []
for combination in _generate_param_value_combinations(constexpr_param_grid):
arrangement, application, tensors = premake(**combination)
for param_name, param_value in combination.items():
if isinstance(param_value, str):
combination[param_name] = (
f"INFINI_DTYPE_{combination[param_name].replace('fp', 'F').upper()}"
)
combination = {f"{name}_": value for name, value in combination.items()}
kernel_name = f"{op_name}_{_generate_suffix(combination.values())}"
ninetoothed.make(
arrangement,
application,
tensors,
caller=caller,
kernel_name=kernel_name,
output_dir=output_dir,
)
header = output_dir / f"{kernel_name}.h"
param_names = ("stream",) + tuple(
inspect.signature(application).parameters.keys()
)
launch = f""" if ({_generate_condition(combination)})
return launch_{kernel_name}({", ".join(param_names)});"""
headers.append(header)
all_param_names.append(param_names)
launches.append(launch)
includes = "\n".join(f'#include "{header}"' for header in headers)
param_names = list(
functools.reduce(
lambda x, y: dict.fromkeys(x) | dict.fromkeys(y),
sorted(all_param_names, key=len, reverse=True),
{},
)
)
param_types = [
"NineToothedStream",
] + ["NineToothedTensor" for _ in range(len(param_names) - 1)]
for param_name in combination:
param_names.append(param_name)
param_types.append("int")
param_decls = ", ".join(
f"{type} {param}" for param, type in zip(param_names, param_types)
)
source_file_name = f"{op_name}.c"
header_file_name = f"{op_name}.h"
func_sig = f"NineToothedResult launch_{op_name}({param_decls})"
joined_launches = "\n".join(launches)
op_decl = f'#ifdef __cplusplus\nextern "C" {func_sig};\n#else\n{func_sig};\n#endif'
op_def = f"""{func_sig} {{
{joined_launches}
return INFINI_STATUS_NOT_IMPLEMENTED;
}}"""
source_content = f"""#include "{header_file_name}"
#include "infinicore.h"
{includes}\n\n{op_def}\n"""
header_content = f"""#include "{_HEADER_PATH}"
\n{op_decl}\n"""
(BUILD_DIRECTORY_PATH / source_file_name).write_text(source_content)
(BUILD_DIRECTORY_PATH / header_file_name).write_text(header_content)
def _generate_condition(combination):
return " && ".join(f"{param} == {value}" for param, value in combination.items())
def _generate_suffix(values):
return "_".join(f"{value}" for value in values)
def _generate_param_value_combinations(param_grid):
keys = list(param_grid.keys())
value_combinations = itertools.product(*param_grid.values())
return tuple(dict(zip(keys, combination)) for combination in value_combinations)
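For a grid with a single constexpr parameter, the strings assembled above produce a C dispatcher shaped like the following. This is a reconstruction from the f-strings for a hypothetical "add" op with tensors a, b, c and a dtype_ parameter, not output captured from an actual build:

/* Reconstructed shape of the generated add.c (hypothetical op and params): */
#include "add.h"
#include "infinicore.h"
#include "add_INFINI_DTYPE_F16.h"

NineToothedResult launch_add(NineToothedStream stream, NineToothedTensor a,
                             NineToothedTensor b, NineToothedTensor c,
                             int dtype_) {
    if (dtype_ == INFINI_DTYPE_F16)
        return launch_add_INFINI_DTYPE_F16(stream, a, b, c);
    return INFINI_STATUS_NOT_IMPLEMENTED;
}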
......@@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create(
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
......@@ -43,6 +43,8 @@ infiniStatus_t Descriptor::calculate(
return _device_info->calculate<AddOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<AddOp, double>(_info, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<AddOp, bf16_t>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
......
#ifndef __ADD_CUDA_H__
#define __ADD_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>
namespace op::add::cuda {
typedef struct AddOp {
public:
......@@ -12,7 +9,7 @@ public:
__device__ __forceinline__ T operator()(const T &a, const T &b) const {
if constexpr (std::is_same_v<T, half2>) {
return __hadd2(a, b);
} else if constexpr (std::is_same_v<T, half>) {
} else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
return __hadd(a, b);
} else if constexpr (std::is_same_v<T, float>) {
return __fadd_rd(a, b);
......