Unverified commit 2f3f4076, authored by Ceng and committed by GitHub
Browse files

issue/634: InfiniCore 支持InfiniLM Llama模型适配 (#668)



* issue/634: InfiniCore 支持InfiniLM Llama模型适配
Signed-off-by: Ceng23333 <441651826@qq.com>

* .
Signed-off-by: default avatarCeng23333 <441651826@qq.com>

---------
Signed-off-by: default avatarCeng23333 <441651826@qq.com>
parent 1bafd1a6
import contextlib
import infinicore.context as context
import infinicore.nn as nn
# Import context functions
......@@ -60,6 +61,7 @@ from infinicore.tensor import (
__all__ = [
# Modules.
"context",
"nn",
# Classes.
"device",
......
......@@ -5,8 +5,8 @@ from infinicore.tensor import Tensor
class RopeAlgo:
    r"""Different types of RoPE algorithms.

    Mirrors the C++ ``infinicore::nn::RoPE::Algo`` enum exposed by the
    ``_infinicore`` pybind11 module as ``RoPEAlgo``:

    * ``GPT_J``    -- interleaved dimension pairing.
    * ``GPT_NEOX`` -- half-split dimension pairing.
    """

    # NOTE: this commit renames the bound enum from ``Algo`` to ``RoPEAlgo``;
    # the stale ``_infinicore.Algo.*`` assignments (diff residue) are removed
    # here, since ``_infinicore.Algo`` no longer exists and referencing it
    # would raise AttributeError at import time.
    GPT_J = _infinicore.RoPEAlgo.GPT_J
    GPT_NEOX = _infinicore.RoPEAlgo.GPT_NEOX
def rope(
......
......@@ -36,7 +36,7 @@ Embedding::Embedding(size_t num_embeddings,
// This would require a slice operation
}
spdlog::debug("Created Embedding module: num_embeddings={}, embedding_dim={}, dtype={}, padding_idx={}",
SPDLOG_DEBUG("Created Embedding module: num_embeddings={}, embedding_dim={}, dtype={}, padding_idx={}",
num_embeddings, embedding_dim, static_cast<int>(dtype_),
padding_idx_.has_value() ? std::to_string(padding_idx_.value()) : "None");
}
......
......@@ -22,7 +22,7 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias, const DataTyp
bias_ = Parameter(); // Default constructed empty parameter
}
spdlog::debug("Created Linear module: in_features={}, out_features={}, bias={}, dtype={}",
SPDLOG_DEBUG("Created Linear module: in_features={}, out_features={}, bias={}, dtype={}",
in_features, out_features, bias, static_cast<int>(dtype_));
}
......
......@@ -19,7 +19,13 @@ Parameter::Parameter(
// Loads raw bytes from `data` into this parameter's storage.
// The blob is first staged into a contiguous CPU tensor, then copied to
// wherever the parameter actually lives (host or device).
// NOTE(review): `data` is assumed to hold at least nbytes() bytes laid out
// contiguously in the parameter's shape/dtype — TODO confirm against callers.
void Parameter::load_blob(const void *data) {
// CPU staging buffer; the trailing `true` presumably requests pinned/host
// memory — verify against Tensor::empty's signature.
auto buffer = Tensor::empty(impl_->shape(), impl_->dtype(), Device(Device::Type::CPU, 0), true);
std::memcpy(buffer->data(), data, buffer->nbytes());
// If parameter is on CPU, use direct memcpy; otherwise use H2D
if (impl_->device().getType() == Device::Type::CPU) {
infinicore::context::memcpyH2H(impl_->data(), buffer->data(), buffer->nbytes());
} else {
infinicore::context::memcpyH2D(impl_->data(), buffer->data(), buffer->nbytes());
// The H2D copy may be asynchronous; block here so the staging buffer (a
// local about to be destroyed) is not freed while the copy is in flight.
infinicore::context::syncStream();
}
}
} // namespace infinicore::nn
#include "infinicore/nn/rmsnorm.hpp"
#include "infinicore/ops.hpp"
#include <cmath>
#include <spdlog/spdlog.h>
#include <stdexcept>
namespace infinicore::nn {
......@@ -19,9 +18,6 @@ RMSNorm::RMSNorm(size_t normalized_shape, double eps, const DataType &dtype, con
// Initialize weight to ones (standard practice for RMSNorm)
auto ones_tensor = Tensor::ones({normalized_shape}, dtype_, device);
weight_->copy_from(ones_tensor);
spdlog::debug("Created RMSNorm module: normalized_shape={}, eps={}, dtype={}",
normalized_shape, eps, static_cast<int>(dtype_));
}
Tensor RMSNorm::forward(const Tensor &x) const {
......
......@@ -4,7 +4,6 @@
#include <algorithm>
#include <cmath>
#include <functional>
#include <spdlog/spdlog.h>
#include <stdexcept>
namespace infinicore::nn {
......@@ -20,7 +19,6 @@ RoPE::RoPE(size_t head_dim,
theta_(theta),
algo_(algo),
dtype_(dtype) {
if (head_dim % 2 != 0) {
throw std::invalid_argument("head_dim must be even for RoPE, got " + std::to_string(head_dim));
}
......@@ -29,9 +27,6 @@ RoPE::RoPE(size_t head_dim,
// Initialize cache tables
initialize_cache();
spdlog::debug("Created RoPE module: head_dim={}, max_seq_len={}, theta={}, algo={}, dtype={}",
head_dim, max_seq_len, theta, static_cast<int>(algo), static_cast<int>(dtype_));
}
void RoPE::initialize_cache() {
......@@ -42,9 +37,8 @@ void RoPE::initialize_cache() {
INFINICORE_NN_BUFFER_INIT(cos_cache, ({max_seq_len_, cache_dim}, dtype_, device_));
// Pre-compute sin and cos values
// The frequency calculation differs based on algorithm:
// - GPT_J: pairs are (2j, 2j+1) for cache entry j, frequency for dimension 2j is theta^(-2j/head_dim)
// - GPT_NEOX: pairs are (j, j+head_dim/2) for cache entry j, frequency for dimension j is theta^(-j/head_dim)
// Frequency generation always uses GPT-J style (theta^(-2j/head_dim)).
// The rotation algorithm (algo_) controls how dimensions are paired in the kernel.
// Compute on CPU first, then copy to device
auto cpu_device = Device(Device::Type::CPU, 0);
......@@ -55,20 +49,8 @@ void RoPE::initialize_cache() {
for (size_t pos = 0; pos < max_seq_len_; pos++) {
for (size_t j = 0; j < cache_dim; j++) {
// Compute inverse frequency based on algorithm
double inv_freq;
if (algo_ == Algo::GPT_J) {
// GPT_J: pairs are (2j, 2j+1) for cache entry j
// Frequency for pair j: theta^(-2j/head_dim)
inv_freq = 1.0 / std::pow(theta_, 2.0 * static_cast<double>(j) / static_cast<double>(head_dim_));
} else if (algo_ == Algo::GPT_NEOX) {
// GPT_NEOX: pairs are (j, j+head_dim/2) for cache entry j
// Frequency for pair j (corresponding to dimension j): theta^(-j/head_dim)
inv_freq = 1.0 / std::pow(theta_, static_cast<double>(j) / static_cast<double>(head_dim_));
} else {
throw std::runtime_error("Unsupported RoPE algorithm: " + std::to_string(static_cast<int>(algo_)));
}
// GPT-J style inverse frequency: theta^(-2j/head_dim)
double inv_freq = 1.0 / std::pow(theta_, 2.0 * static_cast<double>(j) / static_cast<double>(head_dim_));
// Compute angle: position * inverse_frequency
double angle = static_cast<double>(pos) * inv_freq;
......
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "../utils.hpp"
#include "context.hpp"
#include "device.hpp"
#include "device_event.hpp"
#include "dtype.hpp"
#include "nn.hpp"
#include "ops.hpp"
#include "tensor.hpp"
......@@ -17,6 +19,7 @@ PYBIND11_MODULE(_infinicore, m) {
dtype::bind(m);
ops::bind(m);
tensor::bind(m);
pybind11_nn::bind(m);
}
} // namespace infinicore
#pragma once
#include <pybind11/pybind11.h>
#include "nn/rope.hpp"
namespace py = pybind11;
namespace infinicore::pybind11_nn {
// Registers all infinicore.nn pybind11 bindings on module `m`.
// Currently delegates only to bind_rope(); add further nn-module binders here.
inline void bind(py::module &m) {
bind_rope(m);
}
} // namespace infinicore::pybind11_nn
......@@ -9,11 +9,6 @@ namespace py = pybind11;
namespace infinicore::ops {
inline void bind_rope(py::module &m) {
py::enum_<infinicore::nn::RoPE::Algo>(m, "Algo")
.value("GPT_J", infinicore::nn::RoPE::Algo::GPT_J)
.value("GPT_NEOX", infinicore::nn::RoPE::Algo::GPT_NEOX);
m.def("rope",
&op::rope,
py::arg("x"),
......
......@@ -3,14 +3,15 @@
#include "infinicore/ops.hpp"
#include "infinicore/tensor.hpp"
#include <spdlog/spdlog.h>
#include <algorithm>
#include <cstring>
#include <iostream>
namespace infinicore {
Tensor TensorImpl::to(Device device) const {
if (device == data_.memory->device()) {
return Tensor(const_cast<TensorImpl *>(this)->shared_from_this());
} else {
std::shared_ptr<TensorImpl> _t = empty(meta_.shape, meta_.dtype, device, true);
std::shared_ptr<TensorImpl> _t = empty(meta_.shape, meta_.dtype, device);
_t->copy_from(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()));
return Tensor(_t);
}
......@@ -20,26 +21,44 @@ void TensorImpl::copy_from(Tensor src) {
if (src->shape() != this->shape()) {
throw std::runtime_error("Cannot copy from tensor with different shape");
}
if (this->device().getType() == src->device().getType()) {
if (this->device() == src->device()) {
// If both tensors are contiguous, use direct memcpy (much faster and avoids rearrange issues)
if (this->is_contiguous() && src->is_contiguous()) {
// Use nbytes() to get the actual tensor size
size_t copy_size = std::min(this->nbytes(), src->nbytes());
// For CPU-to-CPU copies, use regular memcpy. For device-to-device, use D2D memcpy
if (this->device().getType() == Device::Type::CPU) {
context::memcpyH2H(this->data(), src->data(), copy_size);
} else {
context::memcpyD2D(this->data(), src->data(), copy_size);
}
} else {
op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), src);
}
} else {
if (!src->is_contiguous()) {
src = src->contiguous();
}
// Use nbytes() to get the actual tensor size, not the full memory size
size_t copy_size = std::min(this->nbytes(), src->nbytes());
if (this->device().getType() == Device::Type::CPU) {
if (this->is_contiguous()) {
context::memcpyD2H(this->data(), src->data(), this->data_.memory->size());
context::memcpyD2H(this->data(), src->data(), copy_size);
} else {
auto local_src = Tensor::empty(this->shape(), this->dtype(), this->device());
context::memcpyD2H(local_src->data(), src->data(), this->data_.memory->size());
op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), local_src);
}
} else if (src->device().getType() == Device::Type::CPU) {
if (this->is_contiguous()) {
context::memcpyH2D(this->data(), src->data(), this->data_.memory->size());
context::memcpyH2D(this->data(), src->data(), copy_size);
} else {
auto local_src = Tensor::empty(this->shape(), this->dtype(), this->device());
context::memcpyH2D(local_src->data(), src->data(), this->data_.memory->size());
context::memcpyH2D(local_src->data(), src->data(), copy_size);
op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), local_src);
}
}
......
......@@ -13,6 +13,10 @@ inline struct SpdlogInitializer {
} else {
spdlog::cfg::load_env_levels("INFINICORE_LOG_LEVEL");
}
// Set pattern for logging
// Using SPDLOG_* macros enables source location support (%s and %#)
// Format: [timestamp] [level] [file:line] message
spdlog::set_pattern("[%Y-%m-%d %H:%M:%S.%e] [%^%l%$] [%s:%#] %v");
}
} spdlog_initializer;
......@@ -21,9 +25,9 @@ inline struct SpdlogInitializer {
#define INFINICORE_CHECK_ERROR(call) \
do { \
spdlog::debug("Entering `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
SPDLOG_DEBUG("Entering `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
infiniStatus_t ret = (call); \
spdlog::debug("Exiting `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
SPDLOG_DEBUG("Exiting `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
if (ret != INFINI_STATUS_SUCCESS) { \
throw std::runtime_error(#call " failed with error: " + std::string(infini_status_string(ret))); \
} \
......
......@@ -348,12 +348,45 @@ target("infiniccl")
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
target_end()
-- Phony aggregate target: building it builds the three C API libraries
-- (infiniop, infinirt, infiniccl) without producing an artifact of its own.
-- NOTE: the duplicated `target("infinicore_c_api")` line (a diff artifact)
-- is removed — declaring the same target twice back-to-back is at best
-- redundant and at worst reopens/overwrites the target definition in xmake.
target("infinicore_c_api")
    set_kind("phony")
    add_deps("infiniop", "infinirt", "infiniccl")
    after_build(function (target) print(YELLOW .. "[Congratulations!] Now you can install the libraries with \"xmake install\"" .. NC) end)
target_end()
-- Shared library exposing the InfiniCore C++ API (tensor, context, nn, ops).
target("infinicore_cpp_api")
set_kind("shared")
-- Link against the C API libraries built by this project.
add_deps("infiniop", "infinirt", "infiniccl")
set_languages("cxx17")
-- Install root: $INFINI_ROOT if set, otherwise ~/.infini (HOMEPATH on Windows).
local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
add_includedirs("include")
-- Public include dir so downstream targets inherit the installed headers.
add_includedirs(INFINI_ROOT.."/include", { public = true })
add_linkdirs(INFINI_ROOT.."/lib")
add_links("infiniop", "infinirt", "infiniccl")
-- Add InfiniCore C++ source files (needed for RoPE and other nn modules)
add_files("src/infinicore/*.cc")
add_files("src/infinicore/context/*.cc")
add_files("src/infinicore/context/*/*.cc")
add_files("src/infinicore/tensor/*.cc")
add_files("src/infinicore/nn/*.cc")
add_files("src/infinicore/ops/*/*.cc")
set_installdir(INFINI_ROOT)
-- Install public headers, preserving the include/infinicore/ directory layout.
add_installfiles("include/infinicore/(**.h)", {prefixdir = "include/infinicore"})
add_installfiles("include/infinicore/(**.hpp)", {prefixdir = "include/infinicore"})
add_installfiles("include/infinicore/(**/*.h)", {prefixdir = "include/infinicore"})
add_installfiles("include/infinicore/(**/*.hpp)",{prefixdir = "include/infinicore"})
add_installfiles("include/infinicore.h", {prefixdir = "include"})
add_installfiles("include/infinicore.hpp", {prefixdir = "include"})
after_build(function (target) print(YELLOW .. "[Congratulations!] Now you can install the libraries with \"xmake install\"" .. NC) end)
target_end()
target("_infinicore")
add_packages("boost")
if is_mode("debug") then
......@@ -379,6 +412,7 @@ target("_infinicore")
add_files("src/infinicore/context/*.cc")
add_files("src/infinicore/context/*/*.cc")
add_files("src/infinicore/tensor/*.cc")
add_files("src/infinicore/nn/*.cc")
add_files("src/infinicore/ops/*/*.cc")
add_files("src/infinicore/pybind11/**.cc")
......
......@@ -89,6 +89,7 @@ target("infinicore-test")
add_files(os.projectdir().."/src/infinicore/nn/*.cc")
add_files(os.projectdir().."/src/infinicore-test/*.cc")
add_files(os.projectdir().."/src/infinicore-test/*/*.cc")
set_installdir(INFINI_ROOT)
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.