Commit 289d4002 authored by Ceng23333's avatar Ceng23333
Browse files

issue/545 nn::module::Rope


Signed-off-by: default avatarCeng23333 <441651826@qq.com>
parent 2e5b2342
......@@ -23,6 +23,8 @@ public:
protected:
Tensor register_parameter(const std::string &name, Parameter param);
Tensor register_buffer(const std::string &name, Parameter buffer);
// Add an existing submodule to this module's hierarchy
// Template parameter M must be a type derived from Module
// Returns the submodule for convenience (allows method chaining)
......@@ -72,6 +74,7 @@ protected:
protected:
Device device_;
std::unordered_map<std::string, std::shared_ptr<Module>> submodules_;
std::unordered_map<std::string, Parameter> buffers_;
std::unordered_map<std::string, Parameter> parameters_;
private:
......@@ -134,4 +137,15 @@ private:
name##_ = infinicore::nn::Parameter args; \
this->register_parameter(#name, name##_)
// Declare a buffer member variable named `name`_ (type infinicore::nn::Parameter).
// Mirrors INFINICORE_NN_PARAM, but the member is meant to be registered with
// register_buffer() so it lands in buffers_ rather than parameters_.
#define INFINICORE_NN_BUFFER(name) \
infinicore::nn::Parameter name##_
// Initialize a buffer in constructor
// Usage: INFINICORE_NN_BUFFER_INIT(name, (shape, dtype, device))
// Example: INFINICORE_NN_BUFFER_INIT(cache, ({max_seq_len, head_dim}, DataType::F32, device))
// NOTE(review): expands to two statements without a do{...}while(0) wrapper,
// so it is unsafe inside an unbraced `if`/`for` body. Same pattern as
// INFINICORE_NN_PARAM_INIT above, so kept as-is for consistency.
#define INFINICORE_NN_BUFFER_INIT(name, args) \
name##_ = infinicore::nn::Parameter args; \
this->register_buffer(#name, name##_)
} // namespace infinicore::nn
#pragma once
#include "module.hpp"
#include "../context/context.hpp"
#include "../tensor.hpp"
#include <memory>
namespace infinicore::nn {
/**
 * @brief Rotary Position Embedding (RoPE) module.
 *
 * Pre-computes sin/cos tables of shape [max_seq_len, head_dim/2] at
 * construction time, registers them as buffers (not trainable parameters),
 * and applies the rotation via the op layer in forward().
 */
class RoPE : public Module {
public:
    /**
     * @brief RoPE algorithm type
     */
    enum class Algo {
        GPT_J = 0,    // GPT-J style RoPE algorithm (Interleave even and odd dimensions)
        GPT_NEOX = 1, // GPT-NeoX style RoPE algorithm (First half dimensions for sin, second half for cos)
    };

    /**
     * @brief Construct a RoPE layer
     *
     * @param head_dim Dimension of each attention head (must be even)
     * @param max_seq_len Maximum sequence length for pre-computed cache
     * @param theta Base frequency for rotary embeddings (default: 10000.0)
     * @param algo RoPE algorithm type (default: Algo::GPT_J)
     * @param dtype Data type for sin/cos cache (default: DataType::F32)
     * @param device Device to create the cache on
     *
     * @throws std::invalid_argument if head_dim is odd
     */
    RoPE(size_t head_dim,
         size_t max_seq_len,
         double theta = 10000.0,
         Algo algo = Algo::GPT_J,
         const DataType &dtype = DataType::F32,
         const Device &device = Device());

    /**
     * @brief Forward pass: apply RoPE to a tensor
     *
     * @param x Input tensor of shape (..., head_dim) where ... is any number of dimensions
     * @param pos Position IDs tensor of shape (*,) typically [seq_len] or [batch, seq_len]
     * @return Rotated tensor with same shape as input
     *
     * Applies rotary position embeddings to the input tensor.
     * For attention mechanisms, call this method separately for query and key tensors.
     *
     * Common input shapes:
     * - [batch, num_heads, seq_len, head_dim]
     * - [batch, seq_len, num_heads, head_dim]
     * - [seq_len, head_dim]
     */
    Tensor forward(const Tensor &x, const Tensor &pos) const;

    // Module information (read-only configuration accessors)
    size_t head_dim() const { return head_dim_; }
    size_t max_seq_len() const { return max_seq_len_; }
    double theta() const { return theta_; }
    Algo algo() const { return algo_; }
    DataType dtype() const { return dtype_; }

    // String representation (configuration summary for debugging/printing)
    std::string extra_repr() const;

protected:
    // Buffers (sin and cos cache tables) - not exposed in state_dict
    INFINICORE_NN_BUFFER(sin_cache);
    INFINICORE_NN_BUFFER(cos_cache);

private:
    // Fill sin_cache_/cos_cache_ with pre-computed rotation tables.
    void initialize_cache();

    size_t head_dim_;    // Dimension of each attention head
    size_t max_seq_len_; // Maximum sequence length
    double theta_;       // Base frequency for rotary embeddings
    Algo algo_;          // RoPE algorithm type
    DataType dtype_;     // Data type for cache tables
};
} // namespace infinicore::nn
......@@ -7,5 +7,6 @@
#include "ops/ones.hpp"
#include "ops/rearrange.hpp"
#include "ops/rms_norm.hpp"
#include "ops/rope.hpp"
#include "ops/silu.hpp"
#include "ops/swiglu.hpp"
#pragma once
#include "../device.hpp"
#include "../tensor.hpp"
#include "../nn/rope.hpp"
#include "common/op.hpp"
namespace infinicore::op {

// Dispatcher wrapper for the RoPE operator: holds a per-device-type registry
// of backend implementations and routes execute() to the matching one.
class RoPE {
public:
    // Backend entry-point signature: (x_out, x, pos, sin_cache, cos_cache, algo).
    using schema = void (*)(Tensor, const Tensor &, const Tensor &, const Tensor &, const Tensor &, infinicore::nn::RoPE::Algo);

    // Look up the implementation registered for the current device and invoke
    // it. Throws std::runtime_error if no backend is registered.
    static void execute(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo);

    // Access the process-wide implementation registry.
    static common::OpDispatcher<schema> &dispatcher();
};

// Internal function: out-of-place RoPE writing into a caller-provided x_out.
void rope_(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo);

// Public API that uses infinicore::nn::RoPE::Algo: allocates the output
// tensor and applies RoPE to `x` at positions `pos` using the caches.
Tensor rope(const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo);

} // namespace infinicore::op
......@@ -141,12 +141,8 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
int main(int argc, char *argv[]) {
try {
// Initialize spdlog for debugging
spdlog::set_level(spdlog::level::debug);
spdlog::info("Starting InfiniCore Memory Management Test Suite");
ParsedArgs args = parseArgs(argc, argv);
spdlog::debug("Arguments parsed successfully");
spdlog::info("Arguments parsed successfully");
std::cout << "==============================================\n"
<< "InfiniCore Memory Management Test Suite\n"
......@@ -156,31 +152,25 @@ int main(int argc, char *argv[]) {
<< "Iterations: " << args.iterations << "\n"
<< "==============================================" << std::endl;
spdlog::debug("About to initialize InfiniCore context");
spdlog::info("About to initialize InfiniCore context");
// Initialize InfiniCore context
infinicore::context::setDevice(infinicore::Device(static_cast<infinicore::Device::Type>(args.device_type), 0));
spdlog::debug("InfiniCore context initialized successfully");
spdlog::info("InfiniCore context initialized successfully");
spdlog::debug("Creating test runner");
spdlog::info("Creating test runner");
// Create test runner
infinicore::test::InfiniCoreTestRunner runner;
spdlog::debug("Test runner created successfully");
spdlog::info("Test runner created successfully");
// Add tests based on arguments
if (args.run_basic) {
spdlog::debug("Adding BasicMemoryTest");
runner.addTest(std::make_unique<infinicore::test::BasicMemoryTest>());
spdlog::debug("BasicMemoryTest added successfully");
spdlog::debug("Adding TensorDestructorTest");
runner.addTest(std::make_unique<infinicore::test::TensorDestructorTest>());
spdlog::debug("TensorDestructorTest added successfully");
}
if (args.run_module) {
spdlog::debug("Adding NNModuleTest");
runner.addTest(std::make_unique<infinicore::test::NNModuleTest>());
spdlog::debug("NNModuleTest added successfully");
}
if (args.run_concurrency) {
......@@ -203,10 +193,10 @@ int main(int argc, char *argv[]) {
runner.addTest(std::make_unique<infinicore::test::StressTest>());
}
spdlog::debug("About to run all tests");
spdlog::info("About to run all tests");
// Run all tests
auto results = runner.runAllTests();
spdlog::debug("All tests completed");
spdlog::info("All tests completed");
// Count results and collect failed tests
size_t passed = 0, failed = 0;
......
This diff is collapsed.
......@@ -6,6 +6,7 @@
#include "infinicore/nn/module.hpp"
#include "infinicore/nn/parameter.hpp"
#include "infinicore/nn/rmsnorm.hpp"
#include "infinicore/nn/rope.hpp"
#include "test_runner.h"
#include <algorithm>
#include <cmath>
......@@ -82,6 +83,7 @@ private:
TestResult testModuleLinear(); // Comprehensive Linear module test
TestResult testModuleEmbedding(); // Embedding module test
TestResult testModuleRMSNorm(); // RMSNorm module test
TestResult testModuleRoPE(); // RoPE module test
TestResult testDtypeAssertion(); // Test dtype assertions when loading parameters
TestResult testTinyLlamaConstruction(); // Comprehensive: construction + weight loading + validation
};
......
......@@ -55,6 +55,11 @@ Tensor Module::register_parameter(const std::string &name, Parameter param) {
return param;
}
// Record `buffer` under `name` in this module's buffer table and hand the
// tensor back so call sites can chain or store it. Registering the same
// name twice replaces the previous entry.
Tensor Module::register_buffer(const std::string &name, Parameter buffer) {
    buffers_.insert_or_assign(name, buffer);
    return buffer;
}
void Module::collect_all_parameters(std::unordered_map<std::string, Parameter> &all_params, const std::string &prefix) const {
// Add direct parameters with the given prefix
for (const auto &[param_name, param] : parameters_) {
......
......@@ -25,15 +25,8 @@ RMSNorm::RMSNorm(size_t normalized_shape, double eps, const DataType &dtype, con
}
// Apply RMS normalization to `x` using the module's learned weight.
// Validates that the input has at least one dimension and that its last
// dimension matches normalized_shape_, then delegates the computation to
// the InfiniCore op layer (backed by InfiniRT/InfiniOP).
//
// @throws std::invalid_argument on rank-0 input or mismatched last dimension
Tensor RMSNorm::forward(const Tensor &x) const {
    auto input_shape = x->shape();
    // Check emptiness separately: calling input_shape.back() on an empty
    // shape (as the old combined condition did while building its message)
    // is undefined behavior.
    if (input_shape.empty()) {
        throw std::invalid_argument(
            "Input must have at least one dimension, normalized_shape is " + std::to_string(normalized_shape_));
    }
    if (input_shape.back() != normalized_shape_) {
        throw std::invalid_argument(
            "Input last dimension " + std::to_string(input_shape.back()) + " doesn't match normalized_shape " + std::to_string(normalized_shape_));
    }
    // y = RMSNorm(x, weight, eps)
    return op::rms_norm(x, weight_, static_cast<float>(eps_));
}
......
#include "infinicore/nn/rope.hpp"
#include "../utils.hpp"
#include "infinicore/ops.hpp"
#include <algorithm>
#include <cmath>
#include <functional>
#include <spdlog/spdlog.h>
#include <stdexcept>
namespace infinicore::nn {
// Construct a RoPE layer: validates head_dim, records the configuration,
// and eagerly pre-computes the sin/cos cache tables on the target device.
RoPE::RoPE(size_t head_dim,
           size_t max_seq_len,
           double theta,
           Algo algo,
           const DataType &dtype,
           const Device &device)
    : head_dim_(head_dim),
      max_seq_len_(max_seq_len),
      theta_(theta),
      algo_(algo),
      dtype_(dtype) {
    // RoPE rotates dimension pairs, so the head dimension must be even.
    if (head_dim % 2 != 0) {
        throw std::invalid_argument("head_dim must be even for RoPE, got " + std::to_string(head_dim));
    }
    // device_ is declared on the Module base class, hence assigned in the
    // body rather than the member-initializer list.
    device_ = device;

    // Initialize cache tables
    initialize_cache();

    spdlog::debug("Created RoPE module: head_dim={}, max_seq_len={}, theta={}, algo={}, dtype={}",
                  head_dim, max_seq_len, theta, static_cast<int>(algo), static_cast<int>(dtype_));
}
// Pre-compute the sin/cos rotation tables and upload them to the target
// device. Cache layout is [max_seq_len, head_dim/2]; entry (pos, j) holds
// sin/cos(pos * inv_freq(j)).
void RoPE::initialize_cache() {
    size_t cache_dim = head_dim_ / 2;

    // Fail fast before allocating device buffers or computing the tables:
    // values are computed in F32 and copy_from performs no dtype conversion.
    // TODO: Add dtype conversion support when cast operation is available
    if (dtype_ != DataType::F32) {
        throw std::runtime_error(
            "RoPE cache dtype conversion not yet supported. Please use DataType::F32 for cache. "
            "Requested dtype: "
            + std::to_string(static_cast<int>(dtype_)));
    }

    // Create sin and cos cache tables: [max_seq_len, cache_dim]
    INFINICORE_NN_BUFFER_INIT(sin_cache, ({max_seq_len_, cache_dim}, dtype_, device_));
    INFINICORE_NN_BUFFER_INIT(cos_cache, ({max_seq_len_, cache_dim}, dtype_, device_));

    // The inverse frequency depends only on the cache column j, not on the
    // position, so compute each one once instead of once per (pos, j) pair:
    // - GPT_J:    pairs are (2j, 2j+1) for cache entry j; frequency theta^(-2j/head_dim)
    // - GPT_NEOX: pairs are (j, j+head_dim/2) for cache entry j; frequency theta^(-j/head_dim)
    std::vector<double> inv_freqs(cache_dim);
    for (size_t j = 0; j < cache_dim; j++) {
        if (algo_ == Algo::GPT_J) {
            inv_freqs[j] = 1.0 / std::pow(theta_, 2.0 * static_cast<double>(j) / static_cast<double>(head_dim_));
        } else if (algo_ == Algo::GPT_NEOX) {
            inv_freqs[j] = 1.0 / std::pow(theta_, static_cast<double>(j) / static_cast<double>(head_dim_));
        } else {
            throw std::runtime_error("Unsupported RoPE algorithm: " + std::to_string(static_cast<int>(algo_)));
        }
    }

    // Compute the tables on the CPU first, then copy to the target device.
    auto cpu_device = Device(Device::Type::CPU, 0);
    std::vector<float> sin_data(max_seq_len_ * cache_dim);
    std::vector<float> cos_data(max_seq_len_ * cache_dim);
    for (size_t pos = 0; pos < max_seq_len_; pos++) {
        for (size_t j = 0; j < cache_dim; j++) {
            // Angle = position * inverse_frequency
            double angle = static_cast<double>(pos) * inv_freqs[j];
            sin_data[pos * cache_dim + j] = static_cast<float>(std::sin(angle));
            cos_data[pos * cache_dim + j] = static_cast<float>(std::cos(angle));
        }
    }

    // Wrap the host data in CPU tensors; copy_from handles cross-device
    // copying automatically, so one direct copy reaches the target device.
    auto sin_cpu = Tensor::from_blob(sin_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
    auto cos_cpu = Tensor::from_blob(cos_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
    sin_cache_->copy_from(sin_cpu);
    cos_cache_->copy_from(cos_cpu);
}
// Apply rotary position embeddings to `x` at positions `pos`.
// Shape/dtype validation is delegated to the InfiniCore op layer
// (backed by InfiniRT/InfiniOP).
Tensor RoPE::forward(const Tensor &x, const Tensor &pos) const {
    Tensor rotated = op::rope(x, pos, sin_cache_, cos_cache_, algo_);
    return rotated;
}
std::string RoPE::extra_repr() const {
std::string algo_str = (algo_ == Algo::GPT_J) ? "GPT_J" : "GPT_NEOX";
return "RoPE(head_dim=" + std::to_string(head_dim_) + ", max_seq_len=" + std::to_string(max_seq_len_) + ", theta=" + std::to_string(theta_) + ", algo=" + algo_str + ", dtype=" + std::to_string(static_cast<int>(dtype_)) + ")";
}
} // namespace infinicore::nn
#include "infinicore/ops/rope.hpp"
#include "infinicore/context/context.hpp"
#include <stdexcept>
namespace infinicore::op {
// Returns the process-wide registry mapping device types to backend
// implementations. Function-local static so the registry is initialized on
// first use (safe during static-init-time backend registration).
// Fix: dropped the stray ';' after the function body (-Wextra-semi warning).
common::OpDispatcher<RoPE::schema> &RoPE::dispatcher() {
    static common::OpDispatcher<RoPE::schema> dispatcher_;
    return dispatcher_;
}
// Route the call to the backend registered for the currently active device.
// Throws std::runtime_error when no implementation has been registered.
void RoPE::execute(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo) {
    const auto device_type = context::getDevice().getType();
    if (auto func = dispatcher().lookup(device_type)) {
        func(x_out, x, pos, sin_cache, cos_cache, algo);
        return;
    }
    throw std::runtime_error("No RoPE implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
// Internal out-of-place entry point: writes RoPE(x) into the caller-provided
// x_out. Thin wrapper over RoPE::execute so callers can reuse a
// preallocated output buffer.
void rope_(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo) {
    RoPE::execute(x_out, x, pos, sin_cache, cos_cache, algo);
}
// Allocating front end: create an output tensor with the same shape, dtype
// and device as the input, then dispatch the out-of-place variant into it.
Tensor rope(const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo) {
    auto x_out = Tensor::empty(x->shape(), x->dtype(), x->device());
    rope_(x_out, x, pos, sin_cache, cos_cache, algo);
    return x_out;
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/rope.hpp"
#include <infiniop.h>
namespace infinicore::op::rope_impl::infiniop {
// Per-thread bounded cache of InfiniOP RoPE descriptors, keyed by a hash of
// the tensor descriptors + algorithm. Avoids rebuilding a descriptor for
// every call with the same configuration.
thread_local common::OpCache<size_t, infiniopRoPEDescriptor_t> caches(
    100, // capacity
    [](infiniopRoPEDescriptor_t &desc) {
        // Eviction/teardown hook: release the native descriptor exactly once.
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyRoPEDescriptor(desc));
            desc = nullptr;
        }
    });
// InfiniOP backend for the RoPE operator: translates the algo enum, obtains
// (or builds and caches) an infiniop descriptor for this tensor
// configuration, allocates the required workspace, and launches the kernel
// on the current stream.
void calculate(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo) {
    // Convert infinicore::nn::RoPE::Algo to infiniopRoPEAlgo_t
    infiniopRoPEAlgo_t infiniop_algo;
    switch (algo) {
    case infinicore::nn::RoPE::Algo::GPT_J:
        infiniop_algo = INFINIOP_ROPE_ALGO_GPT_J;
        break;
    case infinicore::nn::RoPE::Algo::GPT_NEOX:
        infiniop_algo = INFINIOP_ROPE_ALGO_GPT_NEOX;
        break;
    default:
        throw std::runtime_error("Unsupported RoPE algorithm: " + std::to_string(static_cast<int>(algo)));
    }

    // Create hash key for descriptor caching.
    size_t key = hash_combine(x_out, x, pos, sin_cache, cos_cache);
    // NOTE(review): assumes this overload folds the algo hash into `key`
    // in place (i.e. takes `key` by reference) — confirm against hash.hpp.
    hash_combine(key, std::hash<int>()(static_cast<int>(infiniop_algo)));

    // Look up the per-device descriptor cache for the current thread.
    auto device_type = context::getDevice().getType();
    auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);

    auto desc_opt = cache.get(key);
    infiniopRoPEDescriptor_t desc = nullptr;
    if (!desc_opt) {
        // Cache miss: build a fresh descriptor from the tensor descriptors
        // and remember it for subsequent identical calls.
        INFINICORE_CHECK_ERROR(infiniopCreateRoPEDescriptor(
            context::getInfiniopHandle(), &desc,
            x_out->desc(), x->desc(),
            pos->desc(), sin_cache->desc(), cos_cache->desc(),
            infiniop_algo));
        cache.put(key, desc);
    } else {
        desc = *desc_opt;
    }

    // Workspace size is descriptor-dependent; allocate it fresh per call.
    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetRoPEWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);

    // InfiniOP reads from x and writes to x_out (handles copying internally)
    INFINICORE_CHECK_ERROR(infiniopRoPE(
        desc, workspace->data(), workspace_size,
        x_out->data(), x->data(), pos->data(),
        sin_cache->data(), cos_cache->data(), context::getStream()));
}
// Self-registration at static-initialization time: installs `calculate` as
// this backend's RoPE implementation for all device types via registerAll.
// (The `false` flag presumably controls overwrite behavior — confirm
// against OpDispatcher::registerAll.)
static bool registered = []() {
    RoPE::dispatcher().registerAll(&calculate, false);
    return true;
}();
} // namespace infinicore::op::rope_impl::infiniop
......@@ -9,7 +9,7 @@
inline struct SpdlogInitializer {
SpdlogInitializer() {
if (!std::getenv("INFINICORE_LOG_LEVEL")) {
spdlog::set_level(spdlog::level::off);
spdlog::set_level(spdlog::level::info);
} else {
spdlog::cfg::load_env_levels("INFINICORE_LOG_LEVEL");
}
......@@ -21,9 +21,9 @@ inline struct SpdlogInitializer {
#define INFINICORE_CHECK_ERROR(call) \
do { \
spdlog::info("Entering `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
spdlog::debug("Entering `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
infiniStatus_t ret = (call); \
spdlog::info("Exiting `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
spdlog::debug("Exiting `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
if (ret != INFINI_STATUS_SUCCESS) { \
throw std::runtime_error(#call " failed with error: " + std::string(infini_status_string(ret))); \
} \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment