Commit 289d4002 authored by Ceng23333's avatar Ceng23333
Browse files

issue/545 nn::module::Rope


Signed-off-by: default avatarCeng23333 <441651826@qq.com>
parent 2e5b2342
......@@ -23,6 +23,8 @@ public:
protected:
Tensor register_parameter(const std::string &name, Parameter param);
Tensor register_buffer(const std::string &name, Parameter buffer);
// Add an existing submodule to this module's hierarchy
// Template parameter M must be a type derived from Module
// Returns the submodule for convenience (allows method chaining)
......@@ -72,6 +74,7 @@ protected:
protected:
Device device_;
std::unordered_map<std::string, std::shared_ptr<Module>> submodules_;
std::unordered_map<std::string, Parameter> buffers_;
std::unordered_map<std::string, Parameter> parameters_;
private:
......@@ -134,4 +137,15 @@ private:
name##_ = infinicore::nn::Parameter args; \
this->register_parameter(#name, name##_)
// Declare a buffer member variable named `name`_ (type infinicore::nn::Parameter).
// Mirrors INFINICORE_NN_PARAM, but the member is meant to be registered with
// register_buffer() so it lands in buffers_ rather than parameters_.
#define INFINICORE_NN_BUFFER(name) \
infinicore::nn::Parameter name##_
// Initialize a buffer in constructor
// Usage: INFINICORE_NN_BUFFER_INIT(name, (shape, dtype, device))
// Example: INFINICORE_NN_BUFFER_INIT(cache, ({max_seq_len, head_dim}, DataType::F32, device))
// NOTE(review): expands to two statements without a do{...}while(0) wrapper,
// so it is unsafe inside an unbraced `if`/`for` body. Same pattern as
// INFINICORE_NN_PARAM_INIT above, so kept as-is for consistency.
#define INFINICORE_NN_BUFFER_INIT(name, args) \
name##_ = infinicore::nn::Parameter args; \
this->register_buffer(#name, name##_)
} // namespace infinicore::nn
#pragma once
#include "module.hpp"
#include "../context/context.hpp"
#include "../tensor.hpp"
#include <memory>
namespace infinicore::nn {
/**
 * @brief Rotary Position Embedding (RoPE) module.
 *
 * Pre-computes sin/cos tables of shape [max_seq_len, head_dim/2] at
 * construction time, registers them as buffers (not trainable parameters),
 * and applies the rotation via the op layer in forward().
 */
class RoPE : public Module {
public:
    /**
     * @brief RoPE algorithm type
     */
    enum class Algo {
        GPT_J = 0,    // GPT-J style RoPE algorithm (Interleave even and odd dimensions)
        GPT_NEOX = 1, // GPT-NeoX style RoPE algorithm (First half dimensions for sin, second half for cos)
    };

    /**
     * @brief Construct a RoPE layer
     *
     * @param head_dim Dimension of each attention head (must be even)
     * @param max_seq_len Maximum sequence length for pre-computed cache
     * @param theta Base frequency for rotary embeddings (default: 10000.0)
     * @param algo RoPE algorithm type (default: Algo::GPT_J)
     * @param dtype Data type for sin/cos cache (default: DataType::F32)
     * @param device Device to create the cache on
     *
     * @throws std::invalid_argument if head_dim is odd
     */
    RoPE(size_t head_dim,
         size_t max_seq_len,
         double theta = 10000.0,
         Algo algo = Algo::GPT_J,
         const DataType &dtype = DataType::F32,
         const Device &device = Device());

    /**
     * @brief Forward pass: apply RoPE to a tensor
     *
     * @param x Input tensor of shape (..., head_dim) where ... is any number of dimensions
     * @param pos Position IDs tensor of shape (*,) typically [seq_len] or [batch, seq_len]
     * @return Rotated tensor with same shape as input
     *
     * Applies rotary position embeddings to the input tensor.
     * For attention mechanisms, call this method separately for query and key tensors.
     *
     * Common input shapes:
     * - [batch, num_heads, seq_len, head_dim]
     * - [batch, seq_len, num_heads, head_dim]
     * - [seq_len, head_dim]
     */
    Tensor forward(const Tensor &x, const Tensor &pos) const;

    // Module information (read-only configuration accessors)
    size_t head_dim() const { return head_dim_; }
    size_t max_seq_len() const { return max_seq_len_; }
    double theta() const { return theta_; }
    Algo algo() const { return algo_; }
    DataType dtype() const { return dtype_; }

    // String representation (configuration summary for debugging/printing)
    std::string extra_repr() const;

protected:
    // Buffers (sin and cos cache tables) - not exposed in state_dict
    INFINICORE_NN_BUFFER(sin_cache);
    INFINICORE_NN_BUFFER(cos_cache);

private:
    // Fill sin_cache_/cos_cache_ with pre-computed rotation tables.
    void initialize_cache();

    size_t head_dim_;    // Dimension of each attention head
    size_t max_seq_len_; // Maximum sequence length
    double theta_;       // Base frequency for rotary embeddings
    Algo algo_;          // RoPE algorithm type
    DataType dtype_;     // Data type for cache tables
};
} // namespace infinicore::nn
......@@ -7,5 +7,6 @@
#include "ops/ones.hpp"
#include "ops/rearrange.hpp"
#include "ops/rms_norm.hpp"
#include "ops/rope.hpp"
#include "ops/silu.hpp"
#include "ops/swiglu.hpp"
#pragma once
#include "../device.hpp"
#include "../tensor.hpp"
#include "../nn/rope.hpp"
#include "common/op.hpp"
namespace infinicore::op {

// Dispatcher wrapper for the RoPE operator: holds a per-device-type registry
// of backend implementations and routes execute() to the matching one.
class RoPE {
public:
    // Backend entry-point signature: (x_out, x, pos, sin_cache, cos_cache, algo).
    using schema = void (*)(Tensor, const Tensor &, const Tensor &, const Tensor &, const Tensor &, infinicore::nn::RoPE::Algo);

    // Look up the implementation registered for the current device and invoke
    // it. Throws std::runtime_error if no backend is registered.
    static void execute(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo);

    // Access the process-wide implementation registry.
    static common::OpDispatcher<schema> &dispatcher();
};

// Internal function: out-of-place RoPE writing into a caller-provided x_out.
void rope_(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo);

// Public API that uses infinicore::nn::RoPE::Algo: allocates the output
// tensor and applies RoPE to `x` at positions `pos` using the caches.
Tensor rope(const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo);

} // namespace infinicore::op
......@@ -141,12 +141,8 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
int main(int argc, char *argv[]) {
try {
// Initialize spdlog for debugging
spdlog::set_level(spdlog::level::debug);
spdlog::info("Starting InfiniCore Memory Management Test Suite");
ParsedArgs args = parseArgs(argc, argv);
spdlog::debug("Arguments parsed successfully");
spdlog::info("Arguments parsed successfully");
std::cout << "==============================================\n"
<< "InfiniCore Memory Management Test Suite\n"
......@@ -156,31 +152,25 @@ int main(int argc, char *argv[]) {
<< "Iterations: " << args.iterations << "\n"
<< "==============================================" << std::endl;
spdlog::debug("About to initialize InfiniCore context");
spdlog::info("About to initialize InfiniCore context");
// Initialize InfiniCore context
infinicore::context::setDevice(infinicore::Device(static_cast<infinicore::Device::Type>(args.device_type), 0));
spdlog::debug("InfiniCore context initialized successfully");
spdlog::info("InfiniCore context initialized successfully");
spdlog::debug("Creating test runner");
spdlog::info("Creating test runner");
// Create test runner
infinicore::test::InfiniCoreTestRunner runner;
spdlog::debug("Test runner created successfully");
spdlog::info("Test runner created successfully");
// Add tests based on arguments
if (args.run_basic) {
spdlog::debug("Adding BasicMemoryTest");
runner.addTest(std::make_unique<infinicore::test::BasicMemoryTest>());
spdlog::debug("BasicMemoryTest added successfully");
spdlog::debug("Adding TensorDestructorTest");
runner.addTest(std::make_unique<infinicore::test::TensorDestructorTest>());
spdlog::debug("TensorDestructorTest added successfully");
}
if (args.run_module) {
spdlog::debug("Adding NNModuleTest");
runner.addTest(std::make_unique<infinicore::test::NNModuleTest>());
spdlog::debug("NNModuleTest added successfully");
}
if (args.run_concurrency) {
......@@ -203,10 +193,10 @@ int main(int argc, char *argv[]) {
runner.addTest(std::make_unique<infinicore::test::StressTest>());
}
spdlog::debug("About to run all tests");
spdlog::info("About to run all tests");
// Run all tests
auto results = runner.runAllTests();
spdlog::debug("All tests completed");
spdlog::info("All tests completed");
// Count results and collect failed tests
size_t passed = 0, failed = 0;
......
This diff is collapsed.
......@@ -6,6 +6,7 @@
#include "infinicore/nn/module.hpp"
#include "infinicore/nn/parameter.hpp"
#include "infinicore/nn/rmsnorm.hpp"
#include "infinicore/nn/rope.hpp"
#include "test_runner.h"
#include <algorithm>
#include <cmath>
......@@ -82,6 +83,7 @@ private:
TestResult testModuleLinear(); // Comprehensive Linear module test
TestResult testModuleEmbedding(); // Embedding module test
TestResult testModuleRMSNorm(); // RMSNorm module test
TestResult testModuleRoPE(); // RoPE module test
TestResult testDtypeAssertion(); // Test dtype assertions when loading parameters
TestResult testTinyLlamaConstruction(); // Comprehensive: construction + weight loading + validation
};
......
......@@ -55,6 +55,11 @@ Tensor Module::register_parameter(const std::string &name, Parameter param) {
return param;
}
// Record `buffer` under `name` in this module's buffer table and hand the
// tensor back so call sites can chain or store it. Registering the same
// name twice replaces the previous entry.
Tensor Module::register_buffer(const std::string &name, Parameter buffer) {
    buffers_.insert_or_assign(name, buffer);
    return buffer;
}
void Module::collect_all_parameters(std::unordered_map<std::string, Parameter> &all_params, const std::string &prefix) const {
// Add direct parameters with the given prefix
for (const auto &[param_name, param] : parameters_) {
......
......@@ -25,15 +25,8 @@ RMSNorm::RMSNorm(size_t normalized_shape, double eps, const DataType &dtype, con
}
// Apply RMS normalization to `x` using the module's learned weight.
// Validates that the input has at least one dimension and that its last
// dimension matches normalized_shape_, then delegates the computation to
// the InfiniCore op layer (backed by InfiniRT/InfiniOP).
//
// @throws std::invalid_argument on rank-0 input or mismatched last dimension
Tensor RMSNorm::forward(const Tensor &x) const {
    auto input_shape = x->shape();
    // Check emptiness separately: calling input_shape.back() on an empty
    // shape (as the old combined condition did while building its message)
    // is undefined behavior.
    if (input_shape.empty()) {
        throw std::invalid_argument(
            "Input must have at least one dimension, normalized_shape is " + std::to_string(normalized_shape_));
    }
    if (input_shape.back() != normalized_shape_) {
        throw std::invalid_argument(
            "Input last dimension " + std::to_string(input_shape.back()) + " doesn't match normalized_shape " + std::to_string(normalized_shape_));
    }
    // y = RMSNorm(x, weight, eps)
    return op::rms_norm(x, weight_, static_cast<float>(eps_));
}
......
#include "infinicore/nn/rope.hpp"
#include "../utils.hpp"
#include "infinicore/ops.hpp"
#include <algorithm>
#include <cmath>
#include <functional>
#include <spdlog/spdlog.h>
#include <stdexcept>
namespace infinicore::nn {
// Construct a RoPE layer: validates head_dim, records the configuration,
// and eagerly pre-computes the sin/cos cache tables on the target device.
RoPE::RoPE(size_t head_dim,
           size_t max_seq_len,
           double theta,
           Algo algo,
           const DataType &dtype,
           const Device &device)
    : head_dim_(head_dim),
      max_seq_len_(max_seq_len),
      theta_(theta),
      algo_(algo),
      dtype_(dtype) {
    // RoPE rotates dimension pairs, so the head dimension must be even.
    if (head_dim % 2 != 0) {
        throw std::invalid_argument("head_dim must be even for RoPE, got " + std::to_string(head_dim));
    }
    // device_ is declared on the Module base class, hence assigned in the
    // body rather than the member-initializer list.
    device_ = device;

    // Initialize cache tables
    initialize_cache();

    spdlog::debug("Created RoPE module: head_dim={}, max_seq_len={}, theta={}, algo={}, dtype={}",
                  head_dim, max_seq_len, theta, static_cast<int>(algo), static_cast<int>(dtype_));
}
// Pre-compute the sin/cos rotation tables and upload them to the target
// device. Cache layout is [max_seq_len, head_dim/2]; entry (pos, j) holds
// sin/cos(pos * inv_freq(j)).
void RoPE::initialize_cache() {
    size_t cache_dim = head_dim_ / 2;

    // Fail fast before allocating device buffers or computing the tables:
    // values are computed in F32 and copy_from performs no dtype conversion.
    // TODO: Add dtype conversion support when cast operation is available
    if (dtype_ != DataType::F32) {
        throw std::runtime_error(
            "RoPE cache dtype conversion not yet supported. Please use DataType::F32 for cache. "
            "Requested dtype: "
            + std::to_string(static_cast<int>(dtype_)));
    }

    // Create sin and cos cache tables: [max_seq_len, cache_dim]
    INFINICORE_NN_BUFFER_INIT(sin_cache, ({max_seq_len_, cache_dim}, dtype_, device_));
    INFINICORE_NN_BUFFER_INIT(cos_cache, ({max_seq_len_, cache_dim}, dtype_, device_));

    // The inverse frequency depends only on the cache column j, not on the
    // position, so compute each one once instead of once per (pos, j) pair:
    // - GPT_J:    pairs are (2j, 2j+1) for cache entry j; frequency theta^(-2j/head_dim)
    // - GPT_NEOX: pairs are (j, j+head_dim/2) for cache entry j; frequency theta^(-j/head_dim)
    std::vector<double> inv_freqs(cache_dim);
    for (size_t j = 0; j < cache_dim; j++) {
        if (algo_ == Algo::GPT_J) {
            inv_freqs[j] = 1.0 / std::pow(theta_, 2.0 * static_cast<double>(j) / static_cast<double>(head_dim_));
        } else if (algo_ == Algo::GPT_NEOX) {
            inv_freqs[j] = 1.0 / std::pow(theta_, static_cast<double>(j) / static_cast<double>(head_dim_));
        } else {
            throw std::runtime_error("Unsupported RoPE algorithm: " + std::to_string(static_cast<int>(algo_)));
        }
    }

    // Compute the tables on the CPU first, then copy to the target device.
    auto cpu_device = Device(Device::Type::CPU, 0);
    std::vector<float> sin_data(max_seq_len_ * cache_dim);
    std::vector<float> cos_data(max_seq_len_ * cache_dim);
    for (size_t pos = 0; pos < max_seq_len_; pos++) {
        for (size_t j = 0; j < cache_dim; j++) {
            // Angle = position * inverse_frequency
            double angle = static_cast<double>(pos) * inv_freqs[j];
            sin_data[pos * cache_dim + j] = static_cast<float>(std::sin(angle));
            cos_data[pos * cache_dim + j] = static_cast<float>(std::cos(angle));
        }
    }

    // Wrap the host data in CPU tensors; copy_from handles cross-device
    // copying automatically, so one direct copy reaches the target device.
    auto sin_cpu = Tensor::from_blob(sin_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
    auto cos_cpu = Tensor::from_blob(cos_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
    sin_cache_->copy_from(sin_cpu);
    cos_cache_->copy_from(cos_cpu);
}
// Apply rotary position embeddings to `x` at positions `pos`.
// Shape/dtype validation is delegated to the InfiniCore op layer
// (backed by InfiniRT/InfiniOP).
Tensor RoPE::forward(const Tensor &x, const Tensor &pos) const {
    Tensor rotated = op::rope(x, pos, sin_cache_, cos_cache_, algo_);
    return rotated;
}
std::string RoPE::extra_repr() const {
std::string algo_str = (algo_ == Algo::GPT_J) ? "GPT_J" : "GPT_NEOX";
return "RoPE(head_dim=" + std::to_string(head_dim_) + ", max_seq_len=" + std::to_string(max_seq_len_) + ", theta=" + std::to_string(theta_) + ", algo=" + algo_str + ", dtype=" + std::to_string(static_cast<int>(dtype_)) + ")";
}
} // namespace infinicore::nn
#include "infinicore/ops/rope.hpp"
#include "infinicore/context/context.hpp"
#include <stdexcept>
namespace infinicore::op {
// Returns the process-wide registry mapping device types to backend
// implementations. Function-local static so the registry is initialized on
// first use (safe during static-init-time backend registration).
// Fix: dropped the stray ';' after the function body (-Wextra-semi warning).
common::OpDispatcher<RoPE::schema> &RoPE::dispatcher() {
    static common::OpDispatcher<RoPE::schema> dispatcher_;
    return dispatcher_;
}
// Route the call to the backend registered for the currently active device.
// Throws std::runtime_error when no implementation has been registered.
void RoPE::execute(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo) {
    const auto device_type = context::getDevice().getType();
    if (auto func = dispatcher().lookup(device_type)) {
        func(x_out, x, pos, sin_cache, cos_cache, algo);
        return;
    }
    throw std::runtime_error("No RoPE implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
// Internal out-of-place entry point: writes RoPE(x) into the caller-provided
// x_out. Thin wrapper over RoPE::execute so callers can reuse a
// preallocated output buffer.
void rope_(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo) {
    RoPE::execute(x_out, x, pos, sin_cache, cos_cache, algo);
}
// Allocating front end: create an output tensor with the same shape, dtype
// and device as the input, then dispatch the out-of-place variant into it.
Tensor rope(const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo) {
    auto x_out = Tensor::empty(x->shape(), x->dtype(), x->device());
    rope_(x_out, x, pos, sin_cache, cos_cache, algo);
    return x_out;
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/rope.hpp"
#include <infiniop.h>
namespace infinicore::op::rope_impl::infiniop {
// Per-thread bounded cache of InfiniOP RoPE descriptors, keyed by a hash of
// the tensor descriptors + algorithm. Avoids rebuilding a descriptor for
// every call with the same configuration.
thread_local common::OpCache<size_t, infiniopRoPEDescriptor_t> caches(
    100, // capacity
    [](infiniopRoPEDescriptor_t &desc) {
        // Eviction/teardown hook: release the native descriptor exactly once.
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyRoPEDescriptor(desc));
            desc = nullptr;
        }
    });
// InfiniOP backend for the RoPE operator: translates the algo enum, obtains
// (or builds and caches) an infiniop descriptor for this tensor
// configuration, allocates the required workspace, and launches the kernel
// on the current stream.
void calculate(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo) {
    // Convert infinicore::nn::RoPE::Algo to infiniopRoPEAlgo_t
    infiniopRoPEAlgo_t infiniop_algo;
    switch (algo) {
    case infinicore::nn::RoPE::Algo::GPT_J:
        infiniop_algo = INFINIOP_ROPE_ALGO_GPT_J;
        break;
    case infinicore::nn::RoPE::Algo::GPT_NEOX:
        infiniop_algo = INFINIOP_ROPE_ALGO_GPT_NEOX;
        break;
    default:
        throw std::runtime_error("Unsupported RoPE algorithm: " + std::to_string(static_cast<int>(algo)));
    }

    // Create hash key for descriptor caching.
    size_t key = hash_combine(x_out, x, pos, sin_cache, cos_cache);
    // NOTE(review): assumes this overload folds the algo hash into `key`
    // in place (i.e. takes `key` by reference) — confirm against hash.hpp.
    hash_combine(key, std::hash<int>()(static_cast<int>(infiniop_algo)));

    // Look up the per-device descriptor cache for the current thread.
    auto device_type = context::getDevice().getType();
    auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);

    auto desc_opt = cache.get(key);
    infiniopRoPEDescriptor_t desc = nullptr;
    if (!desc_opt) {
        // Cache miss: build a fresh descriptor from the tensor descriptors
        // and remember it for subsequent identical calls.
        INFINICORE_CHECK_ERROR(infiniopCreateRoPEDescriptor(
            context::getInfiniopHandle(), &desc,
            x_out->desc(), x->desc(),
            pos->desc(), sin_cache->desc(), cos_cache->desc(),
            infiniop_algo));
        cache.put(key, desc);
    } else {
        desc = *desc_opt;
    }

    // Workspace size is descriptor-dependent; allocate it fresh per call.
    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetRoPEWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);

    // InfiniOP reads from x and writes to x_out (handles copying internally)
    INFINICORE_CHECK_ERROR(infiniopRoPE(
        desc, workspace->data(), workspace_size,
        x_out->data(), x->data(), pos->data(),
        sin_cache->data(), cos_cache->data(), context::getStream()));
}
// Self-registration at static-initialization time: installs `calculate` as
// this backend's RoPE implementation for all device types via registerAll.
// (The `false` flag presumably controls overwrite behavior — confirm
// against OpDispatcher::registerAll.)
static bool registered = []() {
    RoPE::dispatcher().registerAll(&calculate, false);
    return true;
}();
} // namespace infinicore::op::rope_impl::infiniop
......@@ -9,7 +9,7 @@
inline struct SpdlogInitializer {
SpdlogInitializer() {
if (!std::getenv("INFINICORE_LOG_LEVEL")) {
spdlog::set_level(spdlog::level::off);
spdlog::set_level(spdlog::level::info);
} else {
spdlog::cfg::load_env_levels("INFINICORE_LOG_LEVEL");
}
......@@ -21,9 +21,9 @@ inline struct SpdlogInitializer {
#define INFINICORE_CHECK_ERROR(call) \
do { \
spdlog::info("Entering `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
spdlog::debug("Entering `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
infiniStatus_t ret = (call); \
spdlog::info("Exiting `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
spdlog::debug("Exiting `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
if (ret != INFINI_STATUS_SUCCESS) { \
throw std::runtime_error(#call " failed with error: " + std::string(infini_status_string(ret))); \
} \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment