Commit 38ac084d authored by PanZezhong

issue/95: rename the pybind target to _infinilm

parent ae3ebe19
#pragma once
#include "infinicore/tensor.hpp"
#include "infinicore/device.hpp"
#include "infinicore/tensor.hpp"
#include <algorithm>
#include <utility>
#include <memory>
#include <utility>
namespace infinilm::cache {
......@@ -18,11 +18,11 @@ namespace infinilm::cache {
* that needs KV caching for attention mechanisms.
*/
struct KVCache {
infinicore::Tensor k_cache; // [n_kv_head, capacity, head_dim]
infinicore::Tensor v_cache; // [n_kv_head, capacity, head_dim]
size_t cache_position;      // Current position in cache
size_t max_capacity;        // Maximum capacity of cache
bool initialized;           // Whether cache has been initialized
KVCache()
: cache_position(0), max_capacity(0), initialized(false),
......@@ -41,12 +41,12 @@ struct KVCache {
* @param device Device
*/
void ensure_capacity(size_t num_kv_heads, size_t head_dim, size_t seq_len,
infinicore::DataType dtype, const infinicore::Device &device) {
size_t required_capacity = cache_position + seq_len;
// Lazy initialization
if (!initialized) {
max_capacity = std::max(required_capacity, size_t(4096)); // Start with at least 4096
k_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim},
dtype, device);
v_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim},
......@@ -94,7 +94,7 @@ struct KVCache {
// Ensure capacity
ensure_capacity(num_kv_heads, head_dim, seq_len,
k_new->dtype(), k_new->device());
// Copy new k/v into cache at current position
auto k_dst = k_cache->narrow({{1, cache_position, seq_len}});
......@@ -113,4 +113,4 @@ struct KVCache {
}
};
} // namespace infinilm::models::common
} // namespace infinilm::cache
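For reference, a minimal NumPy sketch of the cache-append semantics in the KVCache header above. The [n_kv_head, capacity, head_dim] layout, the cache_position bookkeeping, and the 4096-slot floor come from this diff; the class and method names below and the return shape are illustrative, and growth beyond the initial capacity is elided here just as it is in the collapsed part of the hunk.

```python
import numpy as np

class KVCacheSketch:
    """Illustrative NumPy sketch of the KVCache behaviour shown above."""

    def __init__(self):
        self.k_cache = None          # [n_kv_head, capacity, head_dim]
        self.v_cache = None
        self.cache_position = 0
        self.max_capacity = 0

    def ensure_capacity(self, n_kv_head, head_dim, seq_len, dtype):
        required = self.cache_position + seq_len
        if self.k_cache is None:
            # Lazy initialization: start with at least 4096 slots, as in the C++ code.
            self.max_capacity = max(required, 4096)
            self.k_cache = np.empty((n_kv_head, self.max_capacity, head_dim), dtype)
            self.v_cache = np.empty((n_kv_head, self.max_capacity, head_dim), dtype)
        # Growth of an already-initialized cache is omitted in this sketch.

    def update(self, k_new, v_new):
        # k_new / v_new: [n_kv_head, seq_len, head_dim]
        n_kv_head, seq_len, head_dim = k_new.shape
        self.ensure_capacity(n_kv_head, head_dim, seq_len, k_new.dtype)
        pos = self.cache_position
        self.k_cache[:, pos:pos + seq_len, :] = k_new   # narrow at cache_position + copy
        self.v_cache[:, pos:pos + seq_len, :] = v_new
        self.cache_position += seq_len
        # Everything cached so far: [n_kv_head, total_seq_len, head_dim]
        return (self.k_cache[:, :self.cache_position, :],
                self.v_cache[:, :self.cache_position, :])
```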
......@@ -3,17 +3,17 @@
#include "infinicore/nn/rope.hpp"
#include "infinicore/ops.hpp"
#include "infinicore/ops/mul.hpp"
#include <spdlog/spdlog.h>
#include <algorithm>
#include <cmath>
#include <cstring>
#include <stdexcept>
#include <iostream>
#include <algorithm>
#include <spdlog/spdlog.h>
#include <stdexcept>
namespace infinilm::models::llama {
LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype)
: hidden_size_(config.hidden_size),
num_attention_heads_(config.num_attention_heads),
num_key_value_heads_(config.num_key_value_heads),
......@@ -22,19 +22,18 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Devi
use_bias_(config.attention_bias) {
// Initialize projection layers
INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, hidden_size_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, kv_dim_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, kv_dim_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_bias_,
dtype, device);
}
infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_states,
const infinicore::Tensor &position_ids,
void *kv_cache) const {
if (!rotary_emb_) {
throw std::runtime_error("LlamaAttention: rotary_emb not configured");
}
......@@ -45,12 +44,11 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
size_t seq_len = shape[1];
// 1. Project Q, K, V
auto q = q_proj_->forward(hidden_states_mutable); // [batch, seq_len, hidden_size]
auto k = k_proj_->forward(hidden_states_mutable); // [batch, seq_len, kv_dim]
auto v = v_proj_->forward(hidden_states_mutable); // [batch, seq_len, kv_dim]
// 2. Reshape for multi-head attention
......@@ -84,7 +82,6 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
auto q_for_rope = q_reshaped->view({batch_size * seq_len, num_attention_heads_, head_dim_})->contiguous();
auto k_for_rope = k_reshaped->view({batch_size * seq_len, num_key_value_heads_, head_dim_})->contiguous();
// Call RoPE on full batch (matching Python pattern)
auto q_rope_out = rotary_emb_->forward(q_for_rope, pos_ids_for_rope);
auto k_rope_out = rotary_emb_->forward(k_for_rope, pos_ids_for_rope);
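The RoPE call above runs on tensors flattened to [batch*seq_len, n_head, head_dim]. As a point of reference only, here is one common rotary-embedding convention (half-split rotation) in NumPy; the exact convention implemented by infinicore's rope op is not visible in this diff, so treat this as an assumption.

```python
import numpy as np

def rope(x, positions, base=10000.0):
    """Apply a half-split rotary embedding (one common convention, illustrative only).
    x: [tokens, n_head, head_dim], positions: int array of shape [tokens]."""
    tokens, n_head, head_dim = x.shape
    half = head_dim // 2
    inv_freq = 1.0 / (base ** (np.arange(half) / half))   # [half]
    angles = positions[:, None] * inv_freq[None, :]        # [tokens, half]
    cos = np.cos(angles)[:, None, :]                        # [tokens, 1, half]
    sin = np.sin(angles)[:, None, :]
    x1, x2 = x[..., :half], x[..., half:]
    return np.concatenate([x1 * cos - x2 * sin,
                           x1 * sin + x2 * cos], axis=-1)
```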
......@@ -98,8 +95,7 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
auto output_tensor = infinicore::Tensor::empty(
{batch_size, seq_len, hidden_size_},
q->dtype(),
q->device()
);
q->device());
for (size_t b = 0; b < batch_size; ++b) {
// Extract batch item from RoPE output (already computed above for full batch)
......@@ -110,13 +106,13 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
// Convert to [n_head, seq_len, head_dim] for cache
// Ensure contiguous after permute for F16 compatibility with cache operations
auto q_rope = q_batch->permute({1, 0, 2})->contiguous();     // [n_q_head, seq_len, head_dim]
auto k_rope = k_batch->permute({1, 0, 2})->contiguous();     // [n_kv_head, seq_len, head_dim]
auto v_permuted = v_batch->permute({1, 0, 2})->contiguous(); // [n_kv_head, seq_len, head_dim]
// 5. Prepare KV caches
infinicore::Tensor k_total = infinicore::Tensor::empty({1, 1, 1}, k_rope->dtype(), k_rope->device());
infinicore::Tensor v_total = infinicore::Tensor::empty({1, 1, 1}, v_permuted->dtype(), v_permuted->device());
infinicore::Tensor k_total;
infinicore::Tensor v_total;
if (external_cache != nullptr) {
auto [k_total_tmp, v_total_tmp] = external_cache->update(k_rope, v_permuted);
k_total = k_total_tmp;
......@@ -136,11 +132,11 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
// Extract from KV cache (k_total and v_total are [n_kv_head, total_seq_len, head_dim])
// Python: key_states_total.narrow(0, i, 1).view((total_seq_len, num_key_value_heads, head_dim))
// Python's narrow+view ensures contiguous memory, so we need to ensure contiguous before permute
auto k_for_attn = k_total->permute({1, 0, 2}); // [total_seq_len, n_kv_head, head_dim]
auto v_for_attn = v_total->permute({1, 0, 2}); // [total_seq_len, n_kv_head, head_dim]
// q_batch is already [seq_len, n_q_head, head_dim] from above
auto q_for_attn = q_batch; // [seq_len, n_q_head, head_dim]
// Python: grouped_query_attention calls repeat_kv if ngroup > 1
// Python: repeat_kv expands [total_seq_len, num_key_value_heads, head_dim] -> [total_seq_len, num_attention_heads, head_dim]
......@@ -154,15 +150,13 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
auto k_strides = k_for_attn->strides();
auto k_strided = k_for_attn->as_strided(
{total_seq_len, n_kv_head, ngroup, head_dim},
{k_strides[0], k_strides[1], 0, k_strides[2]}
);
{k_strides[0], k_strides[1], 0, k_strides[2]});
k_for_attn = k_strided->contiguous()->view({total_seq_len, n_kv_head * ngroup, head_dim});
auto v_strides = v_for_attn->strides();
auto v_strided = v_for_attn->as_strided(
{total_seq_len, n_kv_head, ngroup, head_dim},
{v_strides[0], v_strides[1], 0, v_strides[2]}
);
{v_strides[0], v_strides[1], 0, v_strides[2]});
v_for_attn = v_strided->contiguous()->view({total_seq_len, n_kv_head * ngroup, head_dim});
}
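The as_strided call above implements repeat_kv by inserting a zero-stride group axis and then materializing it with contiguous()->view(). A NumPy sketch of the same trick follows; the shapes mirror the comments in the diff, and the helper name is made up for illustration.

```python
import numpy as np
from numpy.lib.stride_tricks import as_strided

def repeat_kv(x, ngroup):
    """Expand [total_seq_len, n_kv_head, head_dim] -> [total_seq_len, n_kv_head * ngroup, head_dim]
    by adding a zero-stride group axis (no copy), then materializing it."""
    total_seq_len, n_kv_head, head_dim = x.shape
    s0, s1, s2 = x.strides
    expanded = as_strided(x,
                          shape=(total_seq_len, n_kv_head, ngroup, head_dim),
                          strides=(s0, s1, 0, s2))      # stride 0 repeats each KV head
    return np.ascontiguousarray(expanded).reshape(total_seq_len, n_kv_head * ngroup, head_dim)

k = np.arange(2 * 2 * 3, dtype=np.float32).reshape(2, 2, 3)  # [seq=2, kv_heads=2, dim=3]
print(repeat_kv(k, ngroup=4).shape)                           # (2, 8, 3)
```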
......@@ -170,26 +164,25 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
// Python: Q = querys.permute((1, 0, 2)) # [num_heads, seq_len, head_dim]
// Python: K = keys # [total_seq_len, num_heads, head_dim] (NO permute!)
// Python: V = values.permute((1, 0, 2)) # [num_heads, total_seq_len, head_dim]
auto Q = q_for_attn->permute({1, 0, 2}); // [n_q_head, seq_len, head_dim]
auto K = k_for_attn;                     // [total_seq_len, n_q_head, head_dim] - keep as-is (matching Python)
auto V = v_for_attn->permute({1, 0, 2}); // [n_q_head, total_seq_len, head_dim]
// Python: attn_weight = Q @ K.permute((1, 2, 0))
// Python: K.permute((1, 2, 0)) transforms [total_seq_len, num_heads, head_dim] -> [num_heads, head_dim, total_seq_len]
auto K_transposed = K->permute({1, 2, 0}); // [n_q_head, head_dim, total_seq_len]
// Use GEMM with alpha=scaling to combine scaling with matrix multiplication
// This is more efficient than doing matmul followed by mul
float scaling = 1.0f / std::sqrt(static_cast<float>(head_dim_));
auto attn_weight = infinicore::op::matmul(Q, K_transposed, scaling); // [n_q_head, seq_len, total_seq_len]
infinicore::op::causal_softmax_(attn_weight, attn_weight);
auto out = infinicore::op::matmul(attn_weight, V); // [n_q_head, seq_len, head_dim]
// Python: return out.permute((1, 0, 2)).contiguous() # [seq_len, num_heads, head_dim]
auto attn_output = out->permute({1, 0, 2})->contiguous(); // [seq_len, n_q_head, head_dim]
// Python: attn_output_i.copy_(attention_i)
// Python: attn_output = attn_output.view(hidden_states_shape) # [bs, seq_len, hidden_size]
......
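Putting the per-batch attention steps above together, here is a hedged NumPy sketch of the computation: Q·Kᵀ scaled by 1/sqrt(head_dim), a causal softmax, then multiplication by V. The real code fuses the scaling into the GEMM alpha and calls causal_softmax_ in place; this sketch only approximates that with an explicit mask and softmax.

```python
import numpy as np

def causal_attention(q, k, v, head_dim):
    """Illustrative version of the per-batch attention loop above.
    q: [n_head, seq_len, head_dim], k: [total_seq_len, n_head, head_dim] (kept as-is,
    matching the Python reference), v: [n_head, total_seq_len, head_dim]."""
    scaling = 1.0 / np.sqrt(float(head_dim))
    k_t = np.transpose(k, (1, 2, 0))               # [n_head, head_dim, total_seq_len]
    attn = (q @ k_t) * scaling                      # [n_head, seq_len, total_seq_len]

    # Causal mask: query i (offset to the right end of the cache) may only attend
    # to cached positions <= its own absolute position.
    n_head, seq_len, total_seq_len = attn.shape
    offset = total_seq_len - seq_len
    mask = np.triu(np.ones((seq_len, total_seq_len), dtype=bool), k=offset + 1)
    attn = np.where(mask, -np.inf, attn)
    attn = np.exp(attn - attn.max(axis=-1, keepdims=True))
    attn /= attn.sum(axis=-1, keepdims=True)

    out = attn @ v                                   # [n_head, seq_len, head_dim]
    return np.transpose(out, (1, 0, 2))              # [seq_len, n_head, head_dim]
```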
#include "models/llama.hpp"
#include <pybind11/pybind11.h>
namespace py = pybind11;
PYBIND11_MODULE(_infinilm_llama, m) {
PYBIND11_MODULE(_infinilm, m) {
m.doc() = "InfiniLM Llama model Python bindings";
infinilm::models::llama::bind_llama(m);
......
......@@ -14,6 +14,6 @@ if str(_lib_dir) not in sys.path:
# Import the compiled C++ module
# The .so file should be installed in this directory by xmake
import _infinilm_llama
import _infinilm
__all__ = ["_infinilm_llama"]
__all__ = ["_infinilm"]
from ....generation.utils import GenerationMixin
import infinicore
from infinilm.models.llama.configuration_llama import LlamaConfig as _LlamaConfig
from infinilm.lib import _infinilm_llama
from infinilm.lib import _infinilm
import json
import os
from typing import Optional, Union
......@@ -49,7 +49,7 @@ class LlamaConfig:
def _underlying(self):
"""Get underlying C++ config object, creating it if needed"""
if self._cpp_config is None:
self._cpp_config = _infinilm_llama.LlamaConfig()
self._cpp_config = _infinilm.LlamaConfig()
# Copy attributes from Python config to C++ config
for key in dir(self._python_config):
......@@ -107,7 +107,7 @@ class LlamaForCausalLM(GenerationMixin):
self.use_cache = False
self._device = device
self._model = _infinilm_llama.LlamaForCausalLM(
self._model = _infinilm.LlamaForCausalLM(
config._underlying, device._underlying, dtype
)
......
......@@ -9,8 +9,8 @@ from setuptools.command.egg_info import egg_info
def build_cpp_module():
"""Build and install the C++ extension module"""
subprocess.run(["xmake", "build", "_infinilm_llama"], check=True)
subprocess.run(["xmake", "install", "_infinilm_llama"], check=True)
subprocess.run(["xmake", "build", "_infinilm"], check=True)
subprocess.run(["xmake", "install", "_infinilm"], check=True)
class Build(build):
......
......@@ -27,7 +27,7 @@ except ImportError as e:
try:
from infinilm.models.llama import LlamaConfig, LlamaForCausalLM, Device
import _infinilm_llama # Import C++ bindings for HookRegistry
import _infinilm # Import C++ bindings for HookRegistry
except ImportError as e:
print(f"Error: InfiniLM Python package not found. Please install it: {e}")
sys.exit(1)
......@@ -756,7 +756,7 @@ def test_intermediate_validation(
infini_position_ids = torch_to_infinicore_tensor(position_ids, infini_device)
# Create hook registry and register hooks
hook_registry = _infinilm_llama.HookRegistry()
hook_registry = _infinilm.HookRegistry()
def make_infinilm_hook(name):
def hook(hook_name, tensor, layer_idx):
......
......@@ -36,7 +36,7 @@ except ImportError as e:
print(f"Error: InfiniLM Python package not found. Please install it:")
print(f" pip install -e .")
print(f" or")
print(f" xmake build _infinilm_llama && xmake install _infinilm_llama")
print(f" xmake build _infinilm && xmake install _infinilm")
print(f" Error: {e}")
sys.exit(1)
......@@ -487,9 +487,6 @@ def validate_inference(
def main():
"""Main test function"""
# Default model path
# default_model_dir = "/var/qy_home/zenghua/.cache/modelscope/hub/models/LLM-Research/Llama-3.2-1B-Instruct"
default_model_dir = "/var/qy_home/zenghua/.cache/modelscope/hub/models/AI-ModelScope/TinyLlama-1.1B-Chat-v1.0"
# Default prompt
default_prompt = "Hello, how are you?"
......@@ -545,8 +542,6 @@ def main():
sys.exit(1)
i += 1
if model_dir is None:
model_dir = default_model_dir
if not os.path.exists(model_dir):
print(f"Error: Model directory not found: {model_dir}")
......@@ -560,11 +555,11 @@ def main():
)
print(f" Examples: cpu, cuda, cuda:0, cuda:1")
print(f"\nExamples:")
print(f" {sys.argv[0]} {default_model_dir}")
print(f' {sys.argv[0]} {default_model_dir} --prompt "What is AI?"')
print(f" {sys.argv[0]} {default_model_dir} --device cuda:0")
print(f" {sys.argv[0]} dir/to/model")
print(f' {sys.argv[0]} dir/to/model --prompt "What is AI?"')
print(f" {sys.argv[0]} dir/to/model --device cuda:0")
print(
f' {sys.argv[0]} {default_model_dir} --prompt "What is AI?" --device cuda:0'
f' {sys.argv[0]} dir/to/model --prompt "What is AI?" --device cuda:0'
)
sys.exit(1)
......
......@@ -32,8 +32,7 @@ target("infinicore_infer")
add_installfiles("include/infinicore_infer/models/*.h", {prefixdir = "include/infinicore_infer/models"})
target_end()
-- Python bindings for Llama model
target("_infinilm_llama")
target("_infinilm")
add_packages("pybind11")
set_default(false)
add_rules("python.module", {soabi = true})
......@@ -52,8 +51,7 @@ target("_infinilm_llama")
add_links("infinicore_cpp_api", "infiniop", "infinirt", "infiniccl")
-- Add Llama model files
add_files("csrc/models/llama/llama_*.cpp")
add_files("csrc/models/debug_utils/*.cpp")
add_files("csrc/models/*/*.cpp")
add_files("csrc/models/pybind11/models.cc")
set_installdir("python/infinilm")
......