Unverified Commit 8d09630a authored by gongchensu, committed by GitHub

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
......@@ -5,10 +5,10 @@
#include "../context_impl.hpp"
namespace infinicore {
class DeviceCachingAllocator : public MemoryAllocator {
class StreamOrderedAllocator : public MemoryAllocator {
public:
explicit DeviceCachingAllocator(Device device);
~DeviceCachingAllocator() = default;
explicit StreamOrderedAllocator(Device device);
~StreamOrderedAllocator() = default;
std::byte *allocate(size_t size) override;
void deallocate(std::byte *ptr) override;
......
#include "context_impl.hpp"
#include "internal.hpp"
#include "../utils.hpp"
......@@ -29,16 +30,18 @@ Runtime *ContextImpl::getCurrentRuntime() {
return current_runtime_;
}
Runtime *ContextImpl::getCpuRuntime() {
return runtime_table_[int(Device::Type::CPU)][0].get();
}
void ContextImpl::setDevice(Device device) {
if (device == getCurrentRuntime()->device()) {
// Do nothing if the device is already set.
return;
}
thread_local bool warn_switch_runtime = false;
if (getCurrentRuntime()->isGraphRecording() && !warn_switch_runtime) {
spdlog::warn("Switching device runtime during graph recording may break the graph!");
warn_switch_runtime = true;
}
if (runtime_table_[int(device.getType())][device.getIndex()] == nullptr) {
// Lazy initialization of runtime if never set before.
runtime_table_[int(device.getType())][device.getIndex()] = std::unique_ptr<Runtime>(new Runtime(device));
......@@ -100,11 +103,8 @@ infinirtStream_t getStream() {
}
infiniopHandle_t getInfiniopHandle(Device device) {
if (device.getType() == Device::Type::CPU) {
return ContextImpl::singleton().getCpuRuntime()->infiniopHandle();
}
if (device != getDevice()) {
throw std::runtime_error("Requested device doesn't match current runtime.");
setDevice(device);
}
return ContextImpl::singleton().getCurrentRuntime()->infiniopHandle();
}
......@@ -122,7 +122,8 @@ std::shared_ptr<Memory> allocateMemory(size_t size) {
}
std::shared_ptr<Memory> allocateHostMemory(size_t size) {
return ContextImpl::singleton().getCpuRuntime()->allocateMemory(size);
setDevice(Device::cpu());
return allocateMemory(size);
}
std::shared_ptr<Memory> allocatePinnedHostMemory(size_t size) {
......@@ -142,7 +143,8 @@ void memcpyD2D(void *dst, const void *src, size_t size, bool async) {
}
void memcpyH2H(void *dst, const void *src, size_t size) {
return ContextImpl::singleton().getCpuRuntime()->memcpyD2D(dst, src, size);
setDevice(Device::cpu());
return ContextImpl::singleton().getCurrentRuntime()->memcpyD2D(dst, src, size);
}
// Timing API implementations
......@@ -178,6 +180,27 @@ void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) {
ContextImpl::singleton().getCurrentRuntime()->streamWaitEvent(stream, event);
}
bool isGraphRecording() {
return ContextImpl::singleton().getCurrentRuntime()->isGraphRecording();
}
void startGraphRecording() {
ContextImpl::singleton().getCurrentRuntime()->startGraphRecording();
}
void addGraphOperator(std::shared_ptr<graph::GraphOperator> op) {
ContextImpl::singleton().getCurrentRuntime()->addGraphOperator(op);
}
std::shared_ptr<graph::Graph> stopGraphRecording() {
return ContextImpl::singleton().getCurrentRuntime()->stopGraphRecording();
}
std::shared_ptr<Memory> reinstantiateBlob(std::shared_ptr<Memory> blob) {
setDevice(blob->device());
return ContextImpl::singleton().getCurrentRuntime()->reinstantiateBlob(blob);
}
} // namespace context
} // namespace infinicore
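For orientation, a minimal sketch (not part of this commit) of how the context-level graph API added above could be used. The helper name record_add is hypothetical, op::add_ stands in for any graph-recordable operator, and error handling is omitted.

#include "infinicore/context/context.hpp"
#include "infinicore/ops/add.hpp"
#include <memory>

using namespace infinicore;

// Record a short sequence of ops, then replay it as a graph.
std::shared_ptr<graph::Graph> record_add(Tensor c, Tensor a, Tensor b) {
    context::startGraphRecording();
    op::add_(c, a, b);                          // recorded via INFINICORE_GRAPH_OP_RECORD_OR_RUN while recording is active
    auto graph = context::stopGraphRecording(); // also instantiates a device graph when USE_INFINIRT_GRAPH is defined
    graph->run();                               // launches the device graph, or replays the ops one by one
    return graph;
}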
......@@ -19,8 +19,6 @@ protected:
public:
Runtime *getCurrentRuntime();
Runtime *getCpuRuntime();
void setDevice(Device);
size_t getDeviceCount(Device::Type type);
......
#pragma once
#include "infinicore/device.hpp"
#include "infinicore/memory.hpp"
#include "infinicore/graph/graph.hpp"
namespace infinicore::context {
std::shared_ptr<Memory> reinstantiateBlob(std::shared_ptr<Memory> blob);
};
......@@ -2,19 +2,20 @@
#include "../../utils.hpp"
#include "../allocators/device_caching_allocator.hpp"
#include "../allocators/device_pinned_allocator.hpp"
#include "../allocators/host_allocator.hpp"
#include "../allocators/pinnable_block_allocator.hpp"
#include "../allocators/stream_ordered_allocator.hpp"
namespace infinicore {
Runtime::Runtime(Device device) : device_(device) {
Runtime::Runtime(Device device) : device_(device), graph_manager_(std::make_unique<graph::GraphManager>()) {
activate();
INFINICORE_CHECK_ERROR(infinirtStreamCreate(&stream_));
INFINICORE_CHECK_ERROR(infiniopCreateHandle(&infiniop_handle_));
if (device_.getType() == Device::Type::CPU) {
device_memory_allocator_ = std::make_unique<HostAllocator>();
device_memory_allocator_ = std::make_unique<PinnableBlockAllocator>(device);
} else {
device_memory_allocator_ = std::make_unique<DeviceCachingAllocator>(device);
device_memory_allocator_ = std::make_unique<PinnableBlockAllocator>(device);
pinned_host_memory_allocator_ = std::make_unique<DevicePinnedHostAllocator>(device);
}
}
......@@ -76,6 +77,15 @@ std::shared_ptr<Memory> Runtime::allocatePinnedHostMemory(size_t size) {
true);
}
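// Assumption from the code below: reinstantiateBlob() re-wraps an existing allocation
// ("blob") in a fresh Memory handle, marking its block as in-use again so that a blob
// captured during graph recording can be handed out once more; the custom deleter
// returns the pointer to the same device allocator.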
std::shared_ptr<Memory> Runtime::reinstantiateBlob(std::shared_ptr<Memory> blob) {
device_memory_allocator_.get()->mark_in_use_(blob->data(), true);
return std::make_shared<Memory>(
blob->data(), blob->size(), device_,
[alloc = device_memory_allocator_.get()](std::byte *p) {
alloc->deallocate(p);
});
}
void Runtime::memcpyH2D(void *dst, const void *src, size_t size, bool async) {
if (async) {
INFINICORE_CHECK_ERROR(infinirtMemcpyAsync(dst, src, size, INFINIRT_MEMCPY_H2D, stream_));
......@@ -144,6 +154,25 @@ void Runtime::streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) {
INFINICORE_CHECK_ERROR(infinirtStreamWaitEvent(stream, event));
}
bool Runtime::isGraphRecording() const {
return graph_manager_->is_recording();
}
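// Editorial assumption: pin mode asks the PinnableBlockAllocator to keep blocks alive
// at stable addresses while a graph is being recorded, so device pointers baked into
// the captured graph remain valid; stopGraphRecording() turns it back off.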
void Runtime::startGraphRecording() {
device_memory_allocator_->set_pin_mode(true);
return graph_manager_->start_recording();
}
void Runtime::addGraphOperator(std::shared_ptr<graph::GraphOperator> op) {
return graph_manager_->add_operator(op);
}
std::shared_ptr<graph::Graph> Runtime::stopGraphRecording() {
auto graph = graph_manager_->stop_recording();
device_memory_allocator_->set_pin_mode(false);
return graph;
}
std::string Runtime::toString() const {
return fmt::format("Runtime({})", device_.toString());
}
......
#pragma once
#include "../allocators/memory_allocator.hpp"
#include "../allocators/pinnable_block_allocator.hpp"
#include "infinicore/context/context.hpp"
#include "../../graph/graph_manager.hpp"
#include <infiniop.h>
#include <infinirt.h>
......@@ -13,8 +16,9 @@ private:
Device device_;
infinirtStream_t stream_;
infiniopHandle_t infiniop_handle_;
std::unique_ptr<MemoryAllocator> device_memory_allocator_;
std::unique_ptr<PinnableBlockAllocator> device_memory_allocator_;
std::unique_ptr<MemoryAllocator> pinned_host_memory_allocator_;
std::unique_ptr<graph::GraphManager> graph_manager_;
protected:
Runtime(Device device);
......@@ -33,6 +37,7 @@ public:
std::shared_ptr<Memory> allocateMemory(size_t size);
std::shared_ptr<Memory> allocatePinnedHostMemory(size_t size);
std::shared_ptr<Memory> reinstantiateBlob(std::shared_ptr<Memory> blob);
void memcpyH2D(void *dst, const void *src, size_t size, bool async = true);
void memcpyD2H(void *dst, const void *src, size_t size);
......@@ -48,6 +53,12 @@ public:
float elapsedTime(infinirtEvent_t start, infinirtEvent_t end);
void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event);
// Graph
bool isGraphRecording() const;
void startGraphRecording();
void addGraphOperator(std::shared_ptr<graph::GraphOperator> op);
std::shared_ptr<graph::Graph> stopGraphRecording();
std::string toString() const;
friend class ContextImpl;
......
......@@ -41,6 +41,8 @@ std::string Device::toString(const Type &type) {
return "KUNLUN";
case Type::HYGON:
return "HYGON";
case Type::ALI:
return "ALI";
case Type::COUNT:
return "COUNT";
default:
......
#include "graph_manager.hpp"
#include "../utils.hpp"
#include "infinicore/context/context.hpp"
#include <infinirt.h>
namespace infinicore::graph {
/* =========================
* GraphTensor
* ========================= */
GraphTensor::GraphTensor(const Tensor &tensor) : Tensor(tensor->to_blob_()) {
}
/* =========================
* GraphOperator
* ========================= */
void DispatchableGraphOperator::run() const {
runner_(planned_meta_);
}
DispatchableGraphOperator::~DispatchableGraphOperator() {
if (deleter_) {
deleter_(&planned_meta_);
}
}
/* =========================
* Graph
* ========================= */
struct Graph::DeviceGraph {
infinirtGraph_t graph;
infinirtGraphExec_t exec;
infinirtGraphNode_t node;
std::vector<char> log_buffer;
DeviceGraph() {
log_buffer.resize(4 * 1024);
}
~DeviceGraph() {
if (exec) {
infinirtGraphExecDestroy(exec);
}
if (graph) {
infinirtGraphDestroy(graph);
}
}
void launch() {
INFINICORE_CHECK_ERROR(infinirtGraphLuanch(exec, context::getStream()));
}
};
Graph::Graph() {
}
void Graph::run() const {
if (device_graph_ != nullptr && device_graph_.get()->exec != nullptr) {
device_graph_.get()->launch();
} else {
for (auto &op : op_list_) {
op->run();
}
}
}
void Graph::add_operator(std::shared_ptr<GraphOperator> op) {
op_list_.push_back(op);
}
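// Build a device-executable graph from the recorded op list: warm up with a few eager
// runs, then capture a single replay between stream begin/end capture and instantiate
// the result. If any step fails, exec stays null and Graph::run() falls back to
// replaying the operators one by one.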
void Graph::instantiate() {
// Reset device graph
device_graph_ = std::make_unique<DeviceGraph>();
// warmup
for (size_t iter = 0; iter < 5; ++iter) {
this->run();
}
infinicore::context::syncStream();
if (infinirtStreamBeginCapture(
context::getStream(),
INFINIRT_STREAM_CAPTURE_MODE_RELAXED)
!= INFINI_STATUS_SUCCESS) {
return;
}
// Run and record
this->run();
if (infinirtStreamEndCapture(
context::getStream(),
&device_graph_.get()->graph)
!= INFINI_STATUS_SUCCESS) {
return;
}
if (infinirtGraphInstantiate(
&device_graph_.get()->exec,
device_graph_.get()->graph,
&device_graph_.get()->node,
device_graph_.get()->log_buffer.data(),
device_graph_.get()->log_buffer.size())
!= INFINI_STATUS_SUCCESS) {
static bool warned_once = false;
if (!warned_once) {
warned_once = true;
spdlog::warn("Fail to instantiate device graph: {}", std::string(device_graph_.get()->log_buffer.data()));
}
}
}
Graph::~Graph() = default;
/* =========================
* GraphManager
* ========================= */
bool GraphManager::is_recording() const {
return recording_;
}
void GraphManager::start_recording() {
if (is_recording()) {
spdlog::warn("Graph is already recording. Previous recording will be dropped.");
}
recording_ = true;
graph_ = std::make_shared<Graph>();
}
void GraphManager::add_operator(std::shared_ptr<GraphOperator> op) {
INFINICORE_ASSERT(is_recording());
graph_->add_operator(op);
}
std::shared_ptr<Graph> GraphManager::stop_recording() {
if (!is_recording()) {
spdlog::warn("Graph is not recording. Please start recording first.");
return nullptr;
}
recording_ = false;
#ifdef USE_INFINIRT_GRAPH
graph_->instantiate();
#endif
return std::exchange(graph_, nullptr);
}
} // namespace infinicore::graph
#pragma once
#include "infinicore/graph/graph.hpp"
#include <memory>
#include <vector>
namespace infinicore::graph {
class GraphManager {
public:
GraphManager() = default;
~GraphManager() = default;
bool is_recording() const;
void start_recording();
void add_operator(std::shared_ptr<GraphOperator> op);
std::shared_ptr<Graph> stop_recording();
private:
std::shared_ptr<Graph> graph_;
bool recording_ = false;
};
} // namespace infinicore::graph
......@@ -43,6 +43,13 @@ Embedding::Embedding(size_t num_embeddings,
}
Tensor Embedding::forward(const Tensor &indices) const {
// TODO: Implement on-device embedding for all devices, then remove the condition and the classic approach
auto device_type = device_.getType();
if (device_type == Device::Type::NVIDIA || device_type == Device::Type::ILUVATAR || device_type == Device::Type::METAX || device_type == Device::Type::MOORE || device_type == Device::Type::ALI) {
// Use op::embedding which supports device-side input and batch dimension
return op::embedding(indices->contiguous()->to(device_), weight_);
}
// Get the shape of indices
auto indices_shape = indices->shape();
......
#include "infinicore/nn/linear.hpp"
#include "../utils.hpp"
#include "infinicore/ops.hpp"
#include "infinicore/ops/distributed/allreduce.hpp"
#include "infinicore/ops/linear.hpp"
#include "infinicore/ops/linear_w8a8i8.hpp"
#include <optional>
#include <spdlog/spdlog.h>
......@@ -17,21 +19,46 @@ BaseLinear::BaseLinear(size_t in_features, size_t out_features, bool bias,
device_ = device;
}
Tensor BaseLinear::compute_linear(Tensor &input) const {
BaseLinear::BaseLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
const DataType &dtype, const Device &device)
: in_features_(in_features),
out_features_(out_features),
quantization_(quantization),
has_bias_(bias),
dtype_(dtype) {
device_ = device;
}
// Ensure input is contiguous before creating views (required for matmul)
// This prevents hanging when input tensor has non-contiguous memory layout
Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous();
Tensor BaseLinear::compute_linear(Tensor &input) const {
switch (this->quantization_->get_quant_scheme()) {
case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous();
// Use ops::linear_ directly to match Python backend's exact code path
// This ensures identical computation and numerical results
// Parameter inherits from Tensor, so we cast to Tensor explicitly
Tensor weight_tensor = static_cast<const Tensor &>(weight_);
std::optional<Tensor> bias_opt = has_bias_ ? std::make_optional<Tensor>(static_cast<const Tensor &>(bias_)) : std::nullopt;
Tensor weight_packed_tensor = static_cast<const Tensor &>(weight_);
Tensor weight_scale_tensor = static_cast<const Tensor &>(weight_scale_);
// weight_packed should be transposed and non-contiguous.
std::optional<Tensor> bias_opt = has_bias_ ? std::make_optional<Tensor>(static_cast<const Tensor &>(bias_)) : std::nullopt;
auto output = infinicore::op::linear(input_contiguous->contiguous(), weight_tensor->contiguous(), bias_opt);
return output;
}
auto output = infinicore::op::linear_w8a8i8(input_contiguous->contiguous(), weight_packed_tensor, weight_scale_tensor, bias_opt);
return output;
}
default: {
// Ensure input is contiguous before creating views (required for matmul)
// This prevents hanging when input tensor has non-contiguous memory layout
Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous();
// Use ops::linear_ directly to match Python backend's exact code path
// This ensures identical computation and numerical results
// Parameter inherits from Tensor, so we cast to Tensor explicitly
Tensor weight_tensor = static_cast<const Tensor &>(weight_);
std::optional<Tensor> bias_opt = has_bias_ ? std::make_optional<Tensor>(static_cast<const Tensor &>(bias_)) : std::nullopt;
auto output = infinicore::op::linear(input_contiguous->contiguous(), weight_tensor->contiguous(), bias_opt);
return output;
}
}
} // namespace infinicore::nn
Tensor BaseLinear::forward(Tensor &input) const {
return compute_linear(input);
......@@ -70,6 +97,43 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias,
// in_features, out_features, bias, static_cast<int>(dtype_));
}
Linear::Linear(size_t in_features, size_t out_features,
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
const DataType &dtype, const Device &device)
: BaseLinear(in_features, out_features, quantization, bias, dtype, device_) {
device_ = device;
switch (this->quantization_->get_quant_scheme()) {
case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device));
INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device));
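// Shape note (assumption from the parameters above): int8 weights of shape
// {out_features, in_features} with one f32 scale per output row, i.e. per-output-channel
// dequantization for the COMPRESSED_TENSOR_W8A8I8 scheme.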
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device));
} else {
bias_ = Parameter();
}
break;
}
default: {
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device));
// Register bias parameter if requested
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device));
} else {
bias_ = Parameter(); // Default constructed empty parameter
}
// SPDLOG_DEBUG("Created Linear module: in_features={}, out_features={}, bias={}, dtype={}",
// in_features, out_features, bias, static_cast<int>(dtype_));
break;
}
}
}
Tensor Linear::forward(Tensor &input) const {
return BaseLinear::forward(input);
}
......@@ -102,9 +166,45 @@ ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_featur
} else {
bias_ = Parameter(); // Default constructed empty parameter
}
}
// SPDLOG_DEBUG("Created ColumnParallelLinear module: in_features={}, out_features={}, bias={}, dtype={}",
// in_features, out_features, bias, static_cast<int>(dtype_));
ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
const DataType &dtype, const Device &device,
Size tp_rank, Size tp_size)
: BaseLinear(in_features, out_features, quantization, bias, dtype, device_),
tp_rank_(tp_rank),
tp_size_(tp_size) {
device_ = device;
switch (this->quantization_->get_quant_scheme()) {
case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device, 0, tp_rank_, tp_size_));
INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device, 0, tp_rank_, tp_size_));
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1));
} else {
bias_ = Parameter();
}
break;
}
default: {
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device,
0, tp_rank_, tp_size_));
// Register bias parameter if requested
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device,
0, tp_rank_, tp_size_));
} else {
bias_ = Parameter(); // Default constructed empty parameter
}
break;
}
}
}
Tensor ColumnParallelLinear::forward(Tensor &input) const {
......@@ -138,26 +238,53 @@ RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, bo
} else {
bias_ = Parameter(); // Default constructed empty parameter
}
}
// SPDLOG_DEBUG("Created RowParallelLinear module: in_features={}, out_features={}, bias={}, dtype={}",
// in_features, out_features, bias, static_cast<int>(dtype_));
RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
const DataType &dtype, const Device &device,
Size tp_rank, Size tp_size, infinicclComm_t communicator)
: BaseLinear(in_features, out_features, quantization, bias, dtype, device_),
tp_rank_(tp_rank),
tp_size_(tp_size), communicator_(communicator) {
device_ = device;
switch (this->quantization_->get_quant_scheme()) {
case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device, 1, tp_rank_, tp_size_));
INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device, 0, 0, 1));
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, tp_rank_, tp_size_));
} else {
bias_ = Parameter();
}
break;
}
default: {
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device,
1, tp_rank_, tp_size_));
// Register bias parameter if requested
if (bias && (0 == tp_rank_)) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1));
} else {
bias_ = Parameter(); // Default constructed empty parameter
}
// SPDLOG_DEBUG("Created RowParallelLinear module: in_features={}, out_features={}, bias={}, dtype={}",
// in_features, out_features, bias, static_cast<int>(dtype_));
break;
}
}
}
Tensor RowParallelLinear::forward(Tensor &input) const {
auto output = BaseLinear::forward(input);
if ((tp_size_ > 1) && (communicator_ != nullptr)) {
Size count = output->numel();
DataType type = output->dtype();
infinirtStream_t stream = infinicore::context::getStream();
INFINICORE_CHECK_ERROR(infinicclAllReduce(output->data(), output->data(), count, static_cast<infiniDtype_t>(static_cast<int>(type)),
INFINICCL_SUM, communicator_, stream));
INFINICORE_CHECK_ERROR(infinirtStreamSynchronize(stream));
// RUN_INFINI(infinirtStreamSynchronize(stream));
op::distributed::allreduce_(output, output, INFINICCL_SUM, communicator_);
}
return output;
}
......
......@@ -21,6 +21,25 @@ Tensor RMSNorm::forward(const Tensor &x) const {
return op::rms_norm(x, weight_, static_cast<float>(eps_));
}
void RMSNorm::forward_inplace(Tensor &x, Tensor &residual) const {
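// First call: no residual exists yet, so keep x as the residual and just normalize it.
// Later calls: fuse the residual add with RMSNorm where a fused kernel is available,
// otherwise fall back to a separate add_ followed by rms_norm_.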
if (!residual) {
residual = x;
x = op::rms_norm(x, weight_, static_cast<float>(eps_));
} else {
if (device_.getType() == Device::Type::CPU
|| device_.getType() == Device::Type::NVIDIA
|| device_.getType() == Device::Type::ILUVATAR
|| device_.getType() == Device::Type::METAX
|| device_.getType() == Device::Type::MOORE
|| device_.getType() == Device::Type::ALI) {
op::add_rms_norm_inplace(x, residual, weight_, static_cast<float>(eps_));
} else {
op::add_(residual, x, residual);
op::rms_norm_(x, residual, weight_, static_cast<float>(eps_));
}
}
}
std::string RMSNorm::extra_repr() const {
return "RMSNorm(normalized_shape=" + std::to_string(normalized_shape_) + ", eps=" + std::to_string(eps_) + ", dtype=" + std::to_string(static_cast<int>(dtype_)) + ")";
}
......
......@@ -16,12 +16,14 @@ RoPE::RoPE(size_t head_dim,
double theta,
Algo algo,
const DataType &dtype,
const Device &device)
const Device &device,
std::shared_ptr<ScalingConfig> scaling)
: head_dim_(head_dim),
max_seq_len_(max_seq_len),
theta_(theta),
algo_(algo),
dtype_(dtype) {
dtype_(dtype),
scaling_(scaling) {
if (head_dim % 2 != 0) {
throw std::invalid_argument("head_dim must be even for RoPE, got " + std::to_string(head_dim));
}
......@@ -54,14 +56,30 @@ void RoPE::initialize_cache() {
for (size_t j = 0; j < cache_dim; j++) {
// GPT-J style inverse frequency: theta^(-2j/head_dim)
// Compute directly in float to avoid double->float casting
float inv_freq = 1.0f / std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_));
float inv_freq;
float table_factor = 1.0f;
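// LongRoPE scaling (assumption based on the fields used below): each inverse frequency
// is divided by a per-dimension factor, short_factor within the original context window
// and long_factor beyond it, and the resulting sin/cos entries are rescaled by the
// config's factor().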
if (scaling_ == nullptr) {
inv_freq = 1.0f / std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_));
} else if (scaling_->type() == ScalingType::LONGROPE) {
std::shared_ptr<LongRopeConfig> lr = std::dynamic_pointer_cast<LongRopeConfig>(scaling_);
table_factor = lr->factor();
float _ext;
if (pos < lr->original_max_position_embeddings()) {
_ext = lr->short_factor()[j];
} else {
_ext = lr->long_factor()[j];
}
inv_freq = 1.0f / (_ext * std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_)));
} else {
inv_freq = 1.0f / std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_));
}
// Compute angle: position * inverse_frequency
float angle = static_cast<float>(pos) * inv_freq;
// Compute sin and cos directly on float
sin_data[pos * cache_dim + j] = std::sin(angle);
cos_data[pos * cache_dim + j] = std::cos(angle);
sin_data[pos * cache_dim + j] = std::sin(angle) * table_factor;
cos_data[pos * cache_dim + j] = std::cos(angle) * table_factor;
}
}
......
......@@ -3,24 +3,24 @@
namespace infinicore::op {
common::OpDispatcher<Add::schema> &Add::dispatcher() {
static common::OpDispatcher<Add::schema> dispatcher_;
return dispatcher_;
};
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Add);
void Add::execute(Tensor c, Tensor a, Tensor b) {
Add::Add(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
infinicore::context::setDevice(c->device());
dispatcher().lookup(c->device().getType())(c, a, b);
INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b);
}
Tensor add(Tensor a, Tensor b) {
void Add::execute(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(Add, c, a, b);
}
Tensor add(const Tensor &a, const Tensor &b) {
auto c = Tensor::empty(a->shape(), a->dtype(), a->device());
add_(c, a, b);
return c;
}
void add_(Tensor c, Tensor a, Tensor b) {
void add_(Tensor c, const Tensor &a, const Tensor &b) {
Add::execute(c, a, b);
}
......
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/add.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::add_impl::infiniop {
thread_local common::OpCache<size_t, infiniopAddDescriptor_t> caches(
100, // capacity
[](infiniopAddDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyAddDescriptor(desc));
desc = nullptr;
}
});
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Add, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, c, a, b;
};
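// Plan/run/cleanup contract as this refactor appears to use it: plan() is called once to
// build a PlannedMeta (cached descriptor, workspace, and GraphTensor snapshots of the
// operands), run() replays it on the current stream, and cleanup() frees the meta when
// the owning graph operator is destroyed.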
void calculate(Tensor c, Tensor a, Tensor b) {
void *plan(Tensor c, const Tensor &a, const Tensor &b) {
size_t seed = hash_combine(c, b, a);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, Add,
seed,
c->desc(), a->desc(), b->desc());
auto desc_opt = cache.get(seed);
infiniopAddDescriptor_t desc = nullptr;
INFINIOP_WORKSPACE_TENSOR(workspace, Add, descriptor);
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateAddDescriptor(
context::getInfiniopHandle(device), &desc,
c->desc(), a->desc(), b->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(c),
graph::GraphTensor(a),
graph::GraphTensor(b)};
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetAddWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopAdd(
desc, workspace->data(), workspace_size,
c->data(), a->data(), b->data(), context::getStream()));
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->c->data(),
planned->a->data(),
planned->b->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
static bool registered = []() {
Add::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Add, &plan, &run, &cleanup);
} // namespace infinicore::op::add_impl::infiniop
#include "infinicore/ops/add_rms_norm.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(AddRMSNorm);
AddRMSNorm::AddRMSNorm(Tensor y, Tensor residual_out, const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, residual_out, a, b, weight);
INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), y, residual_out, a, b, weight, epsilon);
}
void AddRMSNorm::execute(Tensor y, Tensor residual_out, const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(AddRMSNorm, y, residual_out, a, b, weight, epsilon);
}
std::pair<Tensor, Tensor> add_rms_norm(const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon) {
auto y = Tensor::empty(a->shape(), a->dtype(), a->device());
auto residual_out = Tensor::empty(a->shape(), a->dtype(), a->device());
add_rms_norm_(y, residual_out, a, b, weight, epsilon);
return std::make_pair(y, residual_out);
}
void add_rms_norm_(Tensor out, Tensor residual, const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon) {
AddRMSNorm::execute(out, residual, a, b, weight, epsilon);
}
void add_rms_norm_inplace(Tensor input, Tensor residual, const Tensor &weight, float epsilon) {
add_rms_norm_(input, residual, input, residual, weight, epsilon);
}
} // namespace infinicore::op
#include "infinicore/ops/add_rms_norm.hpp"
#include "../infiniop_impl.hpp"
namespace infinicore::op::add_rms_norm_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, AddRMSNorm, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, out, residual, a, b, weight;
float epsilon;
};
void *plan(Tensor y, Tensor residual_out, const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon) {
size_t seed = hash_combine(y, residual_out, a, b, weight, epsilon);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, AddRMSNorm,
seed, y->desc(), residual_out->desc(),
a->desc(), b->desc(), weight->desc(), epsilon);
INFINIOP_WORKSPACE_TENSOR(workspace, AddRMSNorm, descriptor);
auto planned = new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(y),
graph::GraphTensor(residual_out),
graph::GraphTensor(a),
graph::GraphTensor(b),
graph::GraphTensor(weight),
epsilon};
return planned;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopAddRMSNorm(
planned->descriptor->desc, planned->workspace->data(), planned->workspace->numel(),
planned->out->data(), planned->residual->data(), planned->a->data(), planned->b->data(), planned->weight->data(), context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(AddRMSNorm, &plan, &run, &cleanup);
} // namespace infinicore::op::add_rms_norm_impl::infiniop
#include "infinicore/ops/causal_softmax.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<CausalSoftmax::schema> &CausalSoftmax::dispatcher() {
static common::OpDispatcher<CausalSoftmax::schema> dispatcher_;
return dispatcher_;
};
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(CausalSoftmax);
void CausalSoftmax::execute(Tensor output, Tensor input) {
CausalSoftmax::CausalSoftmax(Tensor output, const Tensor &input) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
infinicore::context::setDevice(output->device());
auto device_type = output->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error("No CausalSoftmax implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
INFINICORE_GRAPH_OP_DISPATCH(output->device().getType(), output, input);
}
func(output, input);
void CausalSoftmax::execute(Tensor output, const Tensor &input) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(CausalSoftmax, output, input);
}
Tensor causal_softmax(Tensor input) {
Shape shape = input->shape();
auto output = Tensor::empty(shape, input->dtype(), input->device());
Tensor causal_softmax(const Tensor &input) {
auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
causal_softmax_(output, input);
return output;
}
void causal_softmax_(Tensor output, Tensor input) {
void causal_softmax_(Tensor output, const Tensor &input) {
CausalSoftmax::execute(output, input);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/causal_softmax.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::causal_softmax_impl::infiniop {
thread_local common::OpCache<size_t, infiniopCausalSoftmaxDescriptor_t> caches(
100, // capacity
[](infiniopCausalSoftmaxDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyCausalSoftmaxDescriptor(desc));
desc = nullptr;
}
});
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, CausalSoftmax, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, output, input;
};
void calculate(Tensor output, Tensor input) {
void *plan(Tensor output, const Tensor &input) {
size_t seed = hash_combine(output, input);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, CausalSoftmax,
seed, output->desc(), input->desc());
auto desc_opt = cache.get(seed);
infiniopCausalSoftmaxDescriptor_t desc = nullptr;
INFINIOP_WORKSPACE_TENSOR(workspace, CausalSoftmax, descriptor);
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateCausalSoftmaxDescriptor(
context::getInfiniopHandle(device), &desc,
output->desc(), input->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(output),
graph::GraphTensor(input)};
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetCausalSoftmaxWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopCausalSoftmax(
desc, workspace->data(), workspace_size,
output->data(), input->data(), context::getStream()));
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->output->data(),
planned->input->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
static bool registered = []() {
CausalSoftmax::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(CausalSoftmax, &plan, &run, &cleanup);
} // namespace infinicore::op::causal_softmax_impl::infiniop
#include "infinicore/ops/dequantize_awq.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(DequantizeAWQ);
DequantizeAWQ::DequantizeAWQ(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale, x_zeros);
INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale, x_zeros);
}
void DequantizeAWQ::execute(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(DequantizeAWQ, x, x_packed, x_scale, x_zeros);
}
void dequantize_awq_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) {
DequantizeAWQ::execute(x, x_packed, x_scale, x_zeros);
}
} // namespace infinicore::op