"test/run_tests_util_test.py" did not exist on "3a47ddf8ea888b4e5fe06bf79ef03a1456274f00"
Unverified commit 8d09630a authored by gongchensu, committed by GitHub

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
@@ -3,17 +3,25 @@
#include <pybind11/pybind11.h>
#include "ops/add.hpp"
#include "ops/add_rms_norm.hpp"
#include "ops/attention.hpp"
#include "ops/causal_softmax.hpp"
#include "ops/embedding.hpp"
#include "ops/flash_attention.hpp"
#include "ops/kv_caching.hpp"
#include "ops/linear.hpp"
#include "ops/linear_w8a8i8.hpp"
#include "ops/matmul.hpp"
#include "ops/mul.hpp"
#include "ops/paged_attention.hpp"
#include "ops/paged_attention_prefill.hpp"
#include "ops/paged_caching.hpp"
#include "ops/random_sample.hpp"
#include "ops/rearrange.hpp"
#include "ops/rms_norm.hpp"
#include "ops/rope.hpp"
#include "ops/silu.hpp"
#include "ops/silu_and_mul.hpp"
#include "ops/swiglu.hpp"
namespace py = pybind11;
@@ -22,18 +30,26 @@ namespace infinicore::ops {
inline void bind(py::module &m) {
bind_add(m);
bind_add_rms_norm(m);
bind_attention(m);
bind_causal_softmax(m);
bind_random_sample(m);
bind_flash_attention(m);
bind_kv_caching(m);
bind_linear(m);
bind_matmul(m);
bind_mul(m);
bind_paged_attention(m);
bind_paged_attention_prefill(m);
bind_paged_caching(m);
bind_random_sample(m);
bind_rearrange(m);
bind_rms_norm(m);
bind_silu(m);
bind_swiglu(m);
bind_rope(m);
bind_embedding(m);
bind_linear_w8a8i8(m);
bind_silu_and_mul(m);
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/add_rms_norm.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_add_rms_norm(py::module &m) {
m.def("add_rms_norm",
&op::add_rms_norm,
py::arg("a"),
py::arg("b"),
py::arg("weight"),
py::arg("epsilon") = 1e-5f,
R"doc(Fused Add and RMS Normalization.
Args:
a: First input tensor
b: Second input tensor
weight: Scale weights
epsilon: Small constant for numerical stability, default is 1e-5
Returns:
Tuple of (normalized_result, add_result): (RMSNorm(a + b) * weight, a + b)
The add_result can be used as residual for subsequent layers.
)doc");
m.def("add_rms_norm_",
&op::add_rms_norm_,
py::arg("y"),
py::arg("residual_out"),
py::arg("a"),
py::arg("b"),
py::arg("weight"),
py::arg("epsilon") = 1e-5f,
R"doc(In-place Fused Add and RMS Normalization.
Args:
y: Output tensor for normalized result
residual_out: Output tensor for add result (a + b) before normalization
a: First input tensor
b: Second input tensor
weight: Scale weights
epsilon: Small constant for numerical stability, default is 1e-5
)doc");
}
} // namespace infinicore::ops
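As a reference for the semantics described in the add_rms_norm docstring above, here is a minimal NumPy sketch (the placement of epsilon inside the square root and the broadcasting of `weight` over the last dimension are common conventions, not confirmed by this diff):

import numpy as np

def add_rms_norm_ref(a, b, weight, epsilon=1e-5):
    # Returns (RMSNorm(a + b) * weight, a + b); the second value is the residual.
    residual = a + b
    rms = np.sqrt(np.mean(residual ** 2, axis=-1, keepdims=True) + epsilon)
    return residual / rms * weight, residual

a = np.random.randn(2, 8).astype(np.float32)
b = np.random.randn(2, 8).astype(np.float32)
y, res = add_rms_norm_ref(a, b, np.ones(8, dtype=np.float32))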
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/flash_attention.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_flash_attention(py::module &m) {
m.def("flash_attention",
&op::flash_attention,
py::arg("q"),
py::arg("k"),
py::arg("v"),
py::arg("total_kv_len"),
py::arg("scale"),
py::arg("is_causal"));
}
} // namespace infinicore::ops
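The flash_attention binding above carries no docstring; as orientation, here is a hedged single-head NumPy sketch of the scaled-dot-product attention it computes (how `total_kv_len` is consumed and the exact tensor layout expected by `op::flash_attention` are not specified here and are assumptions):

import numpy as np

def sdpa_ref(q, k, v, scale, is_causal):
    # q: [seq_q, d], k/v: [seq_kv, d]; a plain reference, not the fused kernel
    scores = (q @ k.T) * scale
    if is_causal:
        seq_q, seq_kv = scores.shape
        # mask keys that lie in the future relative to each query (queries aligned to the end)
        future = np.triu(np.ones((seq_q, seq_kv), dtype=bool), k=seq_kv - seq_q + 1)
        scores = np.where(future, -np.inf, scores)
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    return (weights / weights.sum(axis=-1, keepdims=True)) @ v

out = sdpa_ref(np.random.randn(3, 8), np.random.randn(5, 8), np.random.randn(5, 8), scale=8 ** -0.5, is_causal=True)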
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/kv_caching.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_kv_caching(py::module &m) {
m.def("kv_caching_",
&op::kv_caching_,
py::arg("k_cache"),
py::arg("v_cache"),
py::arg("k"),
py::arg("v"),
py::arg("past_kv_lengths"),
R"doc(In-place Key-Value Caching.
Updates the KV cache in-place with new key and value tensors.
Args:
k_cache: Key cache tensor to update in-place
v_cache: Value cache tensor to update in-place
k: New key tensor to append
v: New value tensor to append
past_kv_lengths: Tensor containing current sequence lengths for each batch
)doc");
}
} // namespace infinicore::ops
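A rough NumPy sketch of the in-place update described in the kv_caching_ docstring; the [batch, max_len, heads, dim] cache layout and the per-batch indexing are assumptions made for illustration only:

import numpy as np

def kv_caching_ref(k_cache, v_cache, k, v, past_kv_lengths):
    # k_cache/v_cache: [batch, max_len, heads, dim]; k/v: [batch, seq, heads, dim]
    seq = k.shape[1]
    for b, past in enumerate(past_kv_lengths):
        # append the new keys/values right after the already cached prefix of length `past`
        k_cache[b, past:past + seq] = k[b]
        v_cache[b, past:past + seq] = v[b]

k_cache = np.zeros((2, 16, 4, 8), dtype=np.float32)
v_cache = np.zeros_like(k_cache)
k = np.random.randn(2, 3, 4, 8).astype(np.float32)
v = np.random.randn(2, 3, 4, 8).astype(np.float32)
kv_caching_ref(k_cache, v_cache, k, v, past_kv_lengths=[0, 5])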
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/linear_w8a8i8.hpp"
namespace py = pybind11;
namespace infinicore::ops {
Tensor py_linear_w8a8i8(Tensor input,
Tensor weight_packed,
Tensor weight_scale,
pybind11::object bias) {
std::optional<Tensor> bias_tensor = std::nullopt;
if (!bias.is_none()) {
bias_tensor = bias.cast<Tensor>();
}
return op::linear_w8a8i8(input, weight_packed, weight_scale, bias_tensor);
}
void py_linear_w8a8i8_(Tensor out,
Tensor input,
Tensor weight_packed,
Tensor weight_scale,
pybind11::object bias) {
std::optional<Tensor> bias_tensor = std::nullopt;
if (!bias.is_none()) {
bias_tensor = bias.cast<Tensor>();
}
op::linear_w8a8i8_(out, input, weight_packed, weight_scale, bias_tensor);
}
inline void bind_linear_w8a8i8(py::module &m) {
m.def("linear_w8a8i8",
&ops::py_linear_w8a8i8,
py::arg("input"),
py::arg("weight_packed"),
py::arg("weight_scale"),
py::arg("bias") = py::none(),
R"doc(linear_w8a8i8.)doc");
m.def("linear_w8a8i8_",
&ops::py_linear_w8a8i8_,
py::arg("out"),
py::arg("input"),
py::arg("weight_packed"),
py::arg("weight_scale"),
py::arg("bias") = py::none(),
R"doc(linear_w8a8i8_.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/paged_attention.hpp"
namespace py = pybind11;
namespace infinicore::ops {
Tensor py_paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, pybind11::object alibi_slopes, float scale) {
std::optional<Tensor> alibi_slopes_tensor = std::nullopt;
if (!alibi_slopes.is_none()) {
alibi_slopes_tensor = alibi_slopes.cast<Tensor>();
}
return op::paged_attention(q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes_tensor, scale);
}
void py_paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, pybind11::object alibi_slopes, float scale) {
std::optional<Tensor> alibi_slopes_tensor = std::nullopt;
if (!alibi_slopes.is_none()) {
alibi_slopes_tensor = alibi_slopes.cast<Tensor>();
}
op::paged_attention_(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes_tensor, scale);
}
inline void bind_paged_attention(py::module &m) {
m.def("paged_attention",
&ops::py_paged_attention,
py::arg("q"),
py::arg("k_cache"),
py::arg("v_cache"),
py::arg("block_tables"),
py::arg("cache_lens"),
py::arg("alibi_slopes"),
py::arg("scale"),
R"doc(Paged attention of query and key cache tensors.)doc");
m.def("paged_attention_",
&ops::py_paged_attention_,
py::arg("out"),
py::arg("q"),
py::arg("k_cache"),
py::arg("v_cache"),
py::arg("block_tables"),
py::arg("cache_lens"),
py::arg("alibi_slopes"),
py::arg("scale"),
R"doc(In-place paged attention of query and key cache tensors.)doc");
}
} // namespace infinicore::ops
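For orientation on paged attention: each sequence's keys and values live in fixed-size blocks scattered through the cache, and `block_tables` plus `cache_lens` describe how to gather them back into logical order before ordinary attention. A hedged NumPy sketch of that gathering step (block size and cache layout are assumptions; ALiBi handling is omitted):

import numpy as np

def gather_paged_kv(k_cache, block_table, cache_len, block_size):
    # k_cache: [num_blocks, block_size, heads, dim]; block_table: physical block ids of one sequence
    n_blocks = (cache_len + block_size - 1) // block_size
    blocks = [k_cache[pb] for pb in block_table[:n_blocks]]
    return np.concatenate(blocks, axis=0)[:cache_len]

k_cache = np.arange(4 * 2 * 1 * 3, dtype=np.float32).reshape(4, 2, 1, 3)  # 4 blocks of size 2
keys = gather_paged_kv(k_cache, block_table=[3, 1], cache_len=3, block_size=2)  # logical length 3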
#pragma once
#include "infinicore/ops/paged_attention_prefill.hpp"
#include <pybind11/pybind11.h>
namespace py = pybind11;
namespace infinicore::ops {
Tensor py_paged_attention_prefill(Tensor q,
Tensor k_cache,
Tensor v_cache,
Tensor block_tables,
Tensor history_lens,
Tensor cu_seqlens_q,
py::object alibi_slopes,
float scale) {
std::optional<Tensor> alibi_slopes_tensor = std::nullopt;
if (!alibi_slopes.is_none()) {
alibi_slopes_tensor = alibi_slopes.cast<Tensor>();
}
return op::paged_attention_prefill(
q, k_cache, v_cache, block_tables, history_lens, cu_seqlens_q, alibi_slopes_tensor, scale);
}
void py_paged_attention_prefill_(Tensor out,
Tensor q,
Tensor k_cache,
Tensor v_cache,
Tensor block_tables,
Tensor history_lens,
Tensor cu_seqlens_q,
py::object alibi_slopes,
float scale) {
std::optional<Tensor> alibi_slopes_tensor = std::nullopt;
if (!alibi_slopes.is_none()) {
alibi_slopes_tensor = alibi_slopes.cast<Tensor>();
}
op::paged_attention_prefill_(out, q, k_cache, v_cache, block_tables, history_lens, cu_seqlens_q, alibi_slopes_tensor, scale);
}
inline void bind_paged_attention_prefill(py::module &m) {
m.def("paged_attention_prefill",
&ops::py_paged_attention_prefill,
py::arg("q"),
py::arg("k_cache"),
py::arg("v_cache"),
py::arg("block_tables"),
py::arg("history_lens"),
py::arg("cu_seqlens_q"),
py::arg("alibi_slopes") = py::none(),
py::arg("scale") = 1.0,
R"doc(Paged attention prefill for packed variable-length queries.)doc");
m.def("paged_attention_prefill_",
&ops::py_paged_attention_prefill_,
py::arg("out"),
py::arg("q"),
py::arg("k_cache"),
py::arg("v_cache"),
py::arg("block_tables"),
py::arg("history_lens"),
py::arg("cu_seqlens_q"),
py::arg("alibi_slopes") = py::none(),
py::arg("scale") = 1.0,
R"doc(In-place paged attention prefill for packed variable-length queries.)doc");
}
} // namespace infinicore::ops
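The prefill variant above takes `cu_seqlens_q` because queries from several requests are packed along the first dimension; the cumulative offsets mark where each request's rows start and end. A small illustrative snippet of that packed layout (array shapes here are made up for the example):

import numpy as np

lengths = [5, 3, 7]                                        # three requests
cu_seqlens_q = np.concatenate([[0], np.cumsum(lengths)])   # [0, 5, 8, 15]
q_packed = np.random.randn(int(cu_seqlens_q[-1]), 4, 64)   # [total_q, heads, head_dim]
for i in range(len(lengths)):
    start, end = cu_seqlens_q[i], cu_seqlens_q[i + 1]
    q_i = q_packed[start:end]                              # query rows belonging to request i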
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/paged_caching.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_paged_caching(py::module &m) {
m.def("paged_caching_",
&op::paged_caching_,
py::arg("k_cache"),
py::arg("v_cache"),
py::arg("k"),
py::arg("v"),
py::arg("slot_mapping"),
R"doc(Paged caching of key and value tensors.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/per_channel_quant_i8.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_per_channel_quant_i8(py::module &m) {
m.def("per_channel_quant_i8_",
&op::per_channel_quant_i8_,
py::arg("x"),
py::arg("x_packed"),
py::arg("x_scale"),
R"doc(Per-channel quantization of a tensor.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/scaled_mm_i8.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_scaled_mm_i8(py::module &m) {
m.def("scaled_mm_i8",
&op::scaled_mm_i8,
py::arg("a_p"),
py::arg("a_s"),
py::arg("b_p"),
py::arg("b_s"),
py::arg("bias"),
R"doc(Scaled matrix multiplication of two tensors.)doc");
m.def("scaled_mm_i8_",
&op::scaled_mm_i8_,
py::arg("a"),
py::arg("b"),
py::arg("a_scale"),
py::arg("b_scale"),
R"doc(In-place Scaled matrix multiplication of two tensors.)doc");
}
} // namespace infinicore::ops
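per_channel_quant_i8_, scaled_mm_i8 and linear_w8a8i8 together suggest a W8A8 int8 path: quantize activations per channel, multiply in int8, then dequantize with the two scale factors. A hedged NumPy sketch of the usual symmetric convention (the actual packing format and scale shapes used by these ops are not shown in this diff and are assumptions):

import numpy as np

def per_channel_quant_i8_ref(x):
    # symmetric per-row int8 quantization: scale so the largest magnitude maps to 127
    scale = np.maximum(np.abs(x).max(axis=-1, keepdims=True), 1e-8) / 127.0
    x_q = np.clip(np.round(x / scale), -127, 127).astype(np.int8)
    return x_q, scale.astype(np.float32)

def scaled_mm_i8_ref(a_q, a_s, b_q, b_s, bias=None):
    # int8 matmul accumulated in int32, then dequantized with both scale vectors
    acc = a_q.astype(np.int32) @ b_q.astype(np.int32).T
    out = acc.astype(np.float32) * a_s * b_s.reshape(1, -1)
    return out if bias is None else out + bias

x = np.random.randn(4, 16).astype(np.float32)
w = np.random.randn(8, 16).astype(np.float32)   # one row per output channel (assumed layout)
x_q, x_s = per_channel_quant_i8_ref(x)
w_q, w_s = per_channel_quant_i8_ref(w)
y = scaled_mm_i8_ref(x_q, x_s, w_q, w_s)        # roughly x @ w.T; linear_w8a8i8 would add a bias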
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/silu_and_mul.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_silu_and_mul(py::module &m) {
m.def("silu_and_mul",
&op::silu_and_mul,
py::arg("input"),
R"doc(
SiLU and Mul (SwiGLU) activation function.
Input should be [..., 2*d], output will be [..., d].
)doc");
m.def("silu_and_mul_",
&op::silu_and_mul_,
py::arg("output"),
py::arg("input"),
R"doc(
In-place or destination-specified SiLU and Mul (SwiGLU) activation function.
)doc");
}
} // namespace infinicore::ops
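The silu_and_mul docstring pins down only the shapes ([..., 2*d] in, [..., d] out); here is a minimal NumPy sketch of the usual SwiGLU convention, where the first half gates the second (that ordering is an assumption, not stated in the docstring):

import numpy as np

def silu_and_mul_ref(x):
    # x: [..., 2*d]  ->  silu(x[..., :d]) * x[..., d:]  with shape [..., d]
    d = x.shape[-1] // 2
    gate, up = x[..., :d], x[..., d:]
    return gate / (1.0 + np.exp(-gate)) * up        # silu(g) = g * sigmoid(g)

y = silu_and_mul_ref(np.random.randn(2, 8).astype(np.float32))   # -> shape (2, 4)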
@@ -19,7 +19,8 @@ Tensor TensorImpl::to(Device device) const {
void TensorImpl::copy_from(Tensor src) {
if (src->shape() != this->shape()) {
throw std::runtime_error("Cannot copy from tensor with different shape");
throw std::runtime_error(
"Cannot copy from tensor with different shape. Src: " + src->info() + " Dst: " + this->info());
}
if (this->device() == src->device()) {
op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), src);
@@ -31,12 +32,13 @@ void TensorImpl::copy_from(Tensor src) {
// Use nbytes() to get the actual tensor size, not the full memory size
size_t copy_size = std::min(this->nbytes(), src->nbytes());
if (this->device().getType() == Device::Type::CPU) {
context::setDevice(src->device());
if (this->is_contiguous()) {
context::setDevice(src->device());
context::memcpyD2H(this->data(), src->data(), copy_size);
} else {
auto local_src = Tensor::empty(this->shape(), this->dtype(), this->device());
context::memcpyD2H(local_src->data(), src->data(), this->data_.memory->size());
context::setDevice(src->device());
context::memcpyD2H(local_src->data(), src->data(), copy_size);
op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), local_src);
}
} else if (src->device().getType() == Device::Type::CPU) {
......
@@ -95,6 +95,20 @@ void print_data_bf16(const uint16_t *data, const Shape &shape, const Strides &st
}
}
// Function for printing I8 data
void print_data_i8(const int8_t *data, const Shape &shape, const Strides &strides, size_t dim) {
if (dim == shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) {
std::cout << static_cast<int>(data[i * strides[dim]]) << " ";
}
std::cout << std::endl;
} else if (dim < shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) {
print_data_i8(data + i * strides[dim], shape, strides, dim + 1);
}
}
}
// Template function for writing data recursively to binary file (handles non-contiguous tensors)
template <typename T>
void write_binary_data(std::ofstream &out, const T *data, const Shape &shape, const Strides &strides, size_t dim) {
@@ -181,8 +195,8 @@ void TensorImpl::debug(const std::string &filename) const {
cpu_tensor->shape(), cpu_tensor->strides(), 0);
break;
case DataType::I8:
print_data(reinterpret_cast<const int8_t *>(cpu_data),
cpu_tensor->shape(), cpu_tensor->strides(), 0);
print_data_i8(reinterpret_cast<const int8_t *>(cpu_data),
cpu_tensor->shape(), cpu_tensor->strides(), 0);
break;
case DataType::BF16:
print_data_bf16(reinterpret_cast<const uint16_t *>(cpu_data),
......
#include "infinicore/tensor.hpp"
#include "../context/internal.hpp"
#include "../utils.hpp"
#include "infinicore/context/context.hpp"
#include "infinicore/dtype.hpp"
@@ -275,4 +276,24 @@ std::shared_ptr<TensorImpl> TensorImpl::strided_from_blob(
return t;
}
Tensor TensorImpl::to_blob_() const {
auto t = std::shared_ptr<TensorImpl>(new TensorImpl(shape(), strides(), dtype()));
t->data_.offset = this->data_.offset;
t->data_.memory = std::make_shared<Memory>(this->data_.memory->data(), this->data_.memory->size(), this->data_.memory->device(), nullptr);
t->to_blob_mark_ = true;
return Tensor{t};
}
Tensor TensorImpl::resume_from_blob_() const {
auto t = std::shared_ptr<TensorImpl>(new TensorImpl(shape(), strides(), dtype()));
t->data_.offset = this->data_.offset;
if (to_blob_mark_) {
t->data_.memory = context::reinstantiateBlob(this->data_.memory);
} else {
t->data_.memory = this->data_.memory;
}
return Tensor{t};
}
} // namespace infinicore
@@ -2,6 +2,8 @@
#include "infinicore/dtype.hpp"
#include "infinicore/tensor.hpp"
#include "../utils.hpp"
#include <spdlog/spdlog.h>
#include <stdexcept>
@@ -62,11 +64,11 @@ Tensor TensorImpl::narrow(const std::vector<TensorSliceParams> &slices) const {
Tensor TensorImpl::permute(const Shape &order) const {
// Validate input
assert(meta_.shape.size() == order.size());
INFINICORE_ASSERT(meta_.shape.size() == order.size());
// Check that order contains all indices from 0 to n-1 exactly once
for (size_t i = 0; i < order.size(); i++) {
assert(std::find(order.begin(), order.end(), i) != order.end());
INFINICORE_ASSERT(std::find(order.begin(), order.end(), i) != order.end());
}
// Permute shape and strides
......
@@ -23,14 +23,17 @@ inline struct SpdlogInitializer {
#define STRINGIZE_(x) #x
#define STRINGIZE(x) STRINGIZE_(x)
#define INFINICORE_CHECK_ERROR(call) \
do { \
SPDLOG_DEBUG("Entering `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
infiniStatus_t ret = (call); \
SPDLOG_DEBUG("Exiting `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
if (ret != INFINI_STATUS_SUCCESS) { \
throw std::runtime_error(#call " failed with error: " + std::string(infini_status_string(ret))); \
} \
#define INFINICORE_CHECK_ERROR(call) \
do { \
SPDLOG_DEBUG("Entering `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
infiniStatus_t ret = (call); \
SPDLOG_DEBUG("Exiting `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
if (ret != INFINI_STATUS_SUCCESS) { \
throw std::runtime_error("`" #call "` failed with error: " + std::string(infini_status_string(ret)) \
+ " from " + std::string(__func__) \
+ " at " + std::string(__FILE__) \
+ ":" + std::to_string(__LINE__) + "."); \
} \
} while (false)
#define INFINICORE_ASSERT_TENSORS_SAME_DEVICE(FIRST___, ...) \
@@ -47,3 +50,14 @@ inline struct SpdlogInitializer {
} \
} \
} while (0)
#define INFINICORE_ASSERT(CONDITION__) \
do { \
if (!(CONDITION__)) { \
SPDLOG_ERROR( \
"Assertion `{}` failed from {} at {}:{}", \
#CONDITION__, __func__, __FILE__, __LINE__); \
throw std::runtime_error( \
std::string("Assertion `") + #CONDITION__ + "` failed from " + __func__ + " at " + __FILE__ + ":" + std::to_string(__LINE__)); \
} \
} while (0)
@@ -22,7 +22,7 @@ void printUsage() {
std::cout << " Path to the test gguf file" << std::endl
<< std::endl;
std::cout << " --<device>[:id]" << std::endl;
std::cout << " (Optional) Specify the device type --(cpu|nvidia|cambricon|ascend|metax|moore|iluvatar|qy|kunlun|hygon) and device ID (optional). CPU by default." << std::endl
std::cout << " (Optional) Specify the device type --(cpu|nvidia|cambricon|ascend|metax|moore|iluvatar|qy|kunlun|hygon|ali) and device ID (optional). CPU by default." << std::endl
<< std::endl;
std::cout << " --warmup <warmups>" << std::endl;
std::cout << " (Optional) Number of warmups to perform before timing. Default to 0." << std::endl
@@ -80,6 +80,7 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
PARSE_DEVICE("--qy", INFINI_DEVICE_QY)
PARSE_DEVICE("--kunlun", INFINI_DEVICE_KUNLUN)
PARSE_DEVICE("--hygon", INFINI_DEVICE_HYGON)
PARSE_DEVICE("--ali", INFINI_DEVICE_ALI)
else if (arg == "--warmup" && i + 1 < argc) {
args.warmups = std::stoi(argv[++i]);
}
......
@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/cpu_handle.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
#include "nvidia/nvidia_handle.h"
#endif
#ifdef ENABLE_CAMBRICON_API
@@ -47,6 +47,9 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, iluvatar);
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, ali);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, qy);
#endif
@@ -93,6 +96,9 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, iluvatar);
#endif
#ifdef ENABLE_ALI_API
DELETE(INFINI_DEVICE_ALI, ali);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, qy);
#endif
......
@@ -85,4 +85,20 @@
#define hcclSuccess mcclSuccess
#define hcclCommDestroy mcclCommDestroy
#define hcclAllReduce mcclAllReduce
#define hcGetDevice mcGetDevice
#define hcDeviceAttributeMultiProcessorCount mcDeviceAttributeMultiProcessorCount
#define hcDeviceGetAttribute mcDeviceGetAttribute
#define hcStreamCaptureMode mcStreamCaptureMode
#define hcStreamCaptureModeGlobal mcStreamCaptureModeGlobal
#define hcStreamCaptureModeThreadLocal mcStreamCaptureModeThreadLocal
#define hcStreamCaptureModeRelaxed mcStreamCaptureModeRelaxed
#define hcStreamBeginCapture mcStreamBeginCapture
#define hcStreamEndCapture mcStreamEndCapture
#define hcGraph_t mcGraph_t
#define hcGraphExec_t mcGraphExec_t
#define hcGraphNode_t mcGraphNode_t
#define hcGraphInstantiate mcGraphInstantiate
#define hcGraphDestroy mcGraphDestroy
#define hcGraphExecDestroy mcGraphExecDestroy
#define hcGraphLaunch mcGraphLaunch
#endif
@@ -8,8 +8,10 @@
// Possible maximum number of threads per block for METAX architectures
// Used for picking correct kernel launch configuration
#define METAX_BLOCK_SIZE_1024 1024
#define METAX_BLOCK_SIZE_512 512
#define METAX_BLOCK_SIZE_1024 1024
#define METAX_BLOCK_SIZE_2048 2048
#define METAX_BLOCK_SIZE_4096 4096
#define CHECK_METAX(API) CHECK_INTERNAL(API, hcSuccess)
@@ -17,6 +19,12 @@ using cuda_bfloat16 = hpcc_bfloat16;
using cuda_bfloat162 = hpcc_bfloat162;
using cuda_fp8_e4m3 = __hpcc_fp8_e4m3;
#ifdef ENABLE_METAX_MC_API
using __nv_bfloat16 = __maca_bfloat16;
#else
using __nv_bfloat16 = __hpcc_bfloat16;
#endif
namespace device::metax {
// get the memory offset of the given element in a tensor given its flat index
......