Commit 38ac084d authored by PanZezhong

issue/95: rename the pybind target to _infinilm

parent ae3ebe19
#pragma once
#include "infinicore/tensor.hpp"
#include "infinicore/device.hpp"
#include "infinicore/tensor.hpp"
#include <algorithm>
#include <utility>
#include <memory>
#include <utility>
namespace infinilm::cache {
......@@ -18,11 +18,11 @@ namespace infinilm::cache {
* that needs KV caching for attention mechanisms.
*/
struct KVCache {
infinicore::Tensor k_cache; // [n_kv_head, capacity, head_dim]
infinicore::Tensor v_cache; // [n_kv_head, capacity, head_dim]
size_t cache_position;      // Current position in cache
size_t max_capacity;        // Maximum capacity of cache
bool initialized;           // Whether cache has been initialized
KVCache()
: cache_position(0), max_capacity(0), initialized(false),
......@@ -41,12 +41,12 @@ struct KVCache {
* @param device Device
*/
void ensure_capacity(size_t num_kv_heads, size_t head_dim, size_t seq_len,
infinicore::DataType dtype, const infinicore::Device &device) {
size_t required_capacity = cache_position + seq_len;
// Lazy initialization
if (!initialized) {
max_capacity = std::max(required_capacity, size_t(4096)); // Start with at least 4096
k_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim},
dtype, device);
v_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim},
......@@ -94,7 +94,7 @@ struct KVCache {
// Ensure capacity
ensure_capacity(num_kv_heads, head_dim, seq_len,
k_new->dtype(), k_new->device());
// Copy new k/v into cache at current position
auto k_dst = k_cache->narrow({{1, cache_position, seq_len}});
......@@ -113,4 +113,4 @@ struct KVCache {
}
};
} // namespace infinilm::models::common
} // namespace infinilm::cache
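For reference, a minimal NumPy sketch of the cache-append semantics in the KVCache header above. The [n_kv_head, capacity, head_dim] layout, the cache_position bookkeeping, and the 4096-slot floor come from this diff; the class and method names below and the return shape are illustrative, and growth beyond the initial capacity is elided here just as it is in the collapsed part of the hunk.

```python
import numpy as np

class KVCacheSketch:
    """Illustrative NumPy sketch of the KVCache behaviour shown above."""

    def __init__(self):
        self.k_cache = None          # [n_kv_head, capacity, head_dim]
        self.v_cache = None
        self.cache_position = 0
        self.max_capacity = 0

    def ensure_capacity(self, n_kv_head, head_dim, seq_len, dtype):
        required = self.cache_position + seq_len
        if self.k_cache is None:
            # Lazy initialization: start with at least 4096 slots, as in the C++ code.
            self.max_capacity = max(required, 4096)
            self.k_cache = np.empty((n_kv_head, self.max_capacity, head_dim), dtype)
            self.v_cache = np.empty((n_kv_head, self.max_capacity, head_dim), dtype)
        # Growth of an already-initialized cache is omitted in this sketch.

    def update(self, k_new, v_new):
        # k_new / v_new: [n_kv_head, seq_len, head_dim]
        n_kv_head, seq_len, head_dim = k_new.shape
        self.ensure_capacity(n_kv_head, head_dim, seq_len, k_new.dtype)
        pos = self.cache_position
        self.k_cache[:, pos:pos + seq_len, :] = k_new   # narrow at cache_position + copy
        self.v_cache[:, pos:pos + seq_len, :] = v_new
        self.cache_position += seq_len
        # Everything cached so far: [n_kv_head, total_seq_len, head_dim]
        return (self.k_cache[:, :self.cache_position, :],
                self.v_cache[:, :self.cache_position, :])
```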
......@@ -3,17 +3,17 @@
#include "infinicore/nn/rope.hpp"
#include "infinicore/ops.hpp"
#include "infinicore/ops/mul.hpp"
#include <spdlog/spdlog.h>
#include <algorithm>
#include <cmath>
#include <cstring>
#include <stdexcept>
#include <iostream>
#include <algorithm>
#include <spdlog/spdlog.h>
#include <stdexcept>
namespace infinilm::models::llama {
LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype)
: hidden_size_(config.hidden_size),
num_attention_heads_(config.num_attention_heads),
num_key_value_heads_(config.num_key_value_heads),
......@@ -22,19 +22,18 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Devi
use_bias_(config.attention_bias) {
// Initialize projection layers
INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, hidden_size_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, kv_dim_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, kv_dim_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_bias_,
dtype, device);
}
infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_states,
const infinicore::Tensor &position_ids,
void *kv_cache) const {
if (!rotary_emb_) {
throw std::runtime_error("LlamaAttention: rotary_emb not configured");
}
......@@ -45,12 +44,11 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
size_t seq_len = shape[1];
// 1. Project Q, K, V
auto q = q_proj_->forward(hidden_states_mutable); // [batch, seq_len, hidden_size]
auto k = k_proj_->forward(hidden_states_mutable); // [batch, seq_len, kv_dim]
auto v = v_proj_->forward(hidden_states_mutable); // [batch, seq_len, kv_dim]
// 2. Reshape for multi-head attention
......@@ -84,7 +82,6 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
auto q_for_rope = q_reshaped->view({batch_size * seq_len, num_attention_heads_, head_dim_})->contiguous();
auto k_for_rope = k_reshaped->view({batch_size * seq_len, num_key_value_heads_, head_dim_})->contiguous();
// Call RoPE on full batch (matching Python pattern)
auto q_rope_out = rotary_emb_->forward(q_for_rope, pos_ids_for_rope);
auto k_rope_out = rotary_emb_->forward(k_for_rope, pos_ids_for_rope);
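The RoPE call above runs on tensors flattened to [batch*seq_len, n_head, head_dim]. As a point of reference only, here is one common rotary-embedding convention (half-split rotation) in NumPy; the exact convention implemented by infinicore's rope op is not visible in this diff, so treat this as an assumption.

```python
import numpy as np

def rope(x, positions, base=10000.0):
    """Apply a half-split rotary embedding (one common convention, illustrative only).
    x: [tokens, n_head, head_dim], positions: int array of shape [tokens]."""
    tokens, n_head, head_dim = x.shape
    half = head_dim // 2
    inv_freq = 1.0 / (base ** (np.arange(half) / half))   # [half]
    angles = positions[:, None] * inv_freq[None, :]        # [tokens, half]
    cos = np.cos(angles)[:, None, :]                        # [tokens, 1, half]
    sin = np.sin(angles)[:, None, :]
    x1, x2 = x[..., :half], x[..., half:]
    return np.concatenate([x1 * cos - x2 * sin,
                           x1 * sin + x2 * cos], axis=-1)
```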
......@@ -98,8 +95,7 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
auto output_tensor = infinicore::Tensor::empty(
{batch_size, seq_len, hidden_size_},
q->dtype(),
q->device()
);
q->device());
for (size_t b = 0; b < batch_size; ++b) {
// Extract batch item from RoPE output (already computed above for full batch)
......@@ -110,13 +106,13 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
// Convert to [n_head, seq_len, head_dim] for cache
// Ensure contiguous after permute for F16 compatibility with cache operations
auto q_rope = q_batch->permute({1, 0, 2})->contiguous();     // [n_q_head, seq_len, head_dim]
auto k_rope = k_batch->permute({1, 0, 2})->contiguous();     // [n_kv_head, seq_len, head_dim]
auto v_permuted = v_batch->permute({1, 0, 2})->contiguous(); // [n_kv_head, seq_len, head_dim]
// 5. Prepare KV caches
infinicore::Tensor k_total = infinicore::Tensor::empty({1, 1, 1}, k_rope->dtype(), k_rope->device());
infinicore::Tensor v_total = infinicore::Tensor::empty({1, 1, 1}, v_permuted->dtype(), v_permuted->device());
infinicore::Tensor k_total;
infinicore::Tensor v_total;
if (external_cache != nullptr) {
auto [k_total_tmp, v_total_tmp] = external_cache->update(k_rope, v_permuted);
k_total = k_total_tmp;
......@@ -136,11 +132,11 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
// Extract from KV cache (k_total and v_total are [n_kv_head, total_seq_len, head_dim])
// Python: key_states_total.narrow(0, i, 1).view((total_seq_len, num_key_value_heads, head_dim))
// Python's narrow+view ensures contiguous memory, so we need to ensure contiguous before permute
auto k_for_attn = k_total->permute({1, 0, 2}); // [total_seq_len, n_kv_head, head_dim]
auto v_for_attn = v_total->permute({1, 0, 2}); // [total_seq_len, n_kv_head, head_dim]
// q_batch is already [seq_len, n_q_head, head_dim] from above
auto q_for_attn = q_batch; // [seq_len, n_q_head, head_dim]
// Python: grouped_query_attention calls repeat_kv if ngroup > 1
// Python: repeat_kv expands [total_seq_len, num_key_value_heads, head_dim] -> [total_seq_len, num_attention_heads, head_dim]
......@@ -154,15 +150,13 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
auto k_strides = k_for_attn->strides();
auto k_strided = k_for_attn->as_strided(
{total_seq_len, n_kv_head, ngroup, head_dim},
{k_strides[0], k_strides[1], 0, k_strides[2]}
);
{k_strides[0], k_strides[1], 0, k_strides[2]});
k_for_attn = k_strided->contiguous()->view({total_seq_len, n_kv_head * ngroup, head_dim});
auto v_strides = v_for_attn->strides();
auto v_strided = v_for_attn->as_strided(
{total_seq_len, n_kv_head, ngroup, head_dim},
{v_strides[0], v_strides[1], 0, v_strides[2]}
);
{v_strides[0], v_strides[1], 0, v_strides[2]});
v_for_attn = v_strided->contiguous()->view({total_seq_len, n_kv_head * ngroup, head_dim});
}
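The as_strided call above implements repeat_kv by inserting a zero-stride group axis and then materializing it with contiguous()->view(). A NumPy sketch of the same trick follows; the shapes mirror the comments in the diff, and the helper name is made up for illustration.

```python
import numpy as np
from numpy.lib.stride_tricks import as_strided

def repeat_kv(x, ngroup):
    """Expand [total_seq_len, n_kv_head, head_dim] -> [total_seq_len, n_kv_head * ngroup, head_dim]
    by adding a zero-stride group axis (no copy), then materializing it."""
    total_seq_len, n_kv_head, head_dim = x.shape
    s0, s1, s2 = x.strides
    expanded = as_strided(x,
                          shape=(total_seq_len, n_kv_head, ngroup, head_dim),
                          strides=(s0, s1, 0, s2))      # stride 0 repeats each KV head
    return np.ascontiguousarray(expanded).reshape(total_seq_len, n_kv_head * ngroup, head_dim)

k = np.arange(2 * 2 * 3, dtype=np.float32).reshape(2, 2, 3)  # [seq=2, kv_heads=2, dim=3]
print(repeat_kv(k, ngroup=4).shape)                           # (2, 8, 3)
```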
......@@ -170,26 +164,25 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
// Python: Q = querys.permute((1, 0, 2)) # [num_heads, seq_len, head_dim]
// Python: K = keys # [total_seq_len, num_heads, head_dim] (NO permute!)
// Python: V = values.permute((1, 0, 2)) # [num_heads, total_seq_len, head_dim]
auto Q = q_for_attn->permute({1, 0, 2}); // [n_q_head, seq_len, head_dim]
auto K = k_for_attn;                     // [total_seq_len, n_q_head, head_dim] - keep as-is (matching Python)
auto V = v_for_attn->permute({1, 0, 2}); // [n_q_head, total_seq_len, head_dim]
// Python: attn_weight = Q @ K.permute((1, 2, 0))
// Python: K.permute((1, 2, 0)) transforms [total_seq_len, num_heads, head_dim] -> [num_heads, head_dim, total_seq_len]
auto K_transposed = K->permute({1, 2, 0}); // [n_q_head, head_dim, total_seq_len]
// Use GEMM with alpha=scaling to combine scaling with matrix multiplication
// This is more efficient than doing matmul followed by mul
float scaling = 1.0f / std::sqrt(static_cast<float>(head_dim_));
auto attn_weight = infinicore::op::matmul(Q, K_transposed, scaling); // [n_q_head, seq_len, total_seq_len]
infinicore::op::causal_softmax_(attn_weight, attn_weight);
auto out = infinicore::op::matmul(attn_weight, V); // [n_q_head, seq_len, head_dim]
// Python: return out.permute((1, 0, 2)).contiguous() # [seq_len, num_heads, head_dim]
auto attn_output = out->permute({1, 0, 2})->contiguous(); // [seq_len, n_q_head, head_dim]
// Python: attn_output_i.copy_(attention_i)
// Python: attn_output = attn_output.view(hidden_states_shape) # [bs, seq_len, hidden_size]
......
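Putting the per-batch attention steps above together, here is a hedged NumPy sketch of the computation: Q·Kᵀ scaled by 1/sqrt(head_dim), a causal softmax, then multiplication by V. The real code fuses the scaling into the GEMM alpha and calls causal_softmax_ in place; this sketch only approximates that with an explicit mask and softmax.

```python
import numpy as np

def causal_attention(q, k, v, head_dim):
    """Illustrative version of the per-batch attention loop above.
    q: [n_head, seq_len, head_dim], k: [total_seq_len, n_head, head_dim] (kept as-is,
    matching the Python reference), v: [n_head, total_seq_len, head_dim]."""
    scaling = 1.0 / np.sqrt(float(head_dim))
    k_t = np.transpose(k, (1, 2, 0))               # [n_head, head_dim, total_seq_len]
    attn = (q @ k_t) * scaling                      # [n_head, seq_len, total_seq_len]

    # Causal mask: query i (offset to the right end of the cache) may only attend
    # to cached positions <= its own absolute position.
    n_head, seq_len, total_seq_len = attn.shape
    offset = total_seq_len - seq_len
    mask = np.triu(np.ones((seq_len, total_seq_len), dtype=bool), k=offset + 1)
    attn = np.where(mask, -np.inf, attn)
    attn = np.exp(attn - attn.max(axis=-1, keepdims=True))
    attn /= attn.sum(axis=-1, keepdims=True)

    out = attn @ v                                   # [n_head, seq_len, head_dim]
    return np.transpose(out, (1, 0, 2))              # [seq_len, n_head, head_dim]
```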
#include "models/llama.hpp"
#include <pybind11/pybind11.h>
namespace py = pybind11;
PYBIND11_MODULE(_infinilm_llama, m) {
PYBIND11_MODULE(_infinilm, m) {
m.doc() = "InfiniLM Llama model Python bindings";
infinilm::models::llama::bind_llama(m);
......
......@@ -14,6 +14,6 @@ if str(_lib_dir) not in sys.path:
# Import the compiled C++ module
# The .so file should be installed in this directory by xmake
import _infinilm_llama
import _infinilm
__all__ = ["_infinilm_llama"]
__all__ = ["_infinilm"]
from ....generation.utils import GenerationMixin
import infinicore
from infinilm.models.llama.configuration_llama import LlamaConfig as _LlamaConfig
from infinilm.lib import _infinilm_llama
from infinilm.lib import _infinilm
import json
import os
from typing import Optional, Union
......@@ -49,7 +49,7 @@ class LlamaConfig:
def _underlying(self):
"""Get underlying C++ config object, creating it if needed"""
if self._cpp_config is None:
self._cpp_config = _infinilm_llama.LlamaConfig()
self._cpp_config = _infinilm.LlamaConfig()
# Copy attributes from Python config to C++ config
for key in dir(self._python_config):
......@@ -107,7 +107,7 @@ class LlamaForCausalLM(GenerationMixin):
self.use_cache = False
self._device = device
self._model = _infinilm_llama.LlamaForCausalLM(
self._model = _infinilm.LlamaForCausalLM(
config._underlying, device._underlying, dtype
)
......
......@@ -9,8 +9,8 @@ from setuptools.command.egg_info import egg_info
def build_cpp_module():
"""Build and install the C++ extension module"""
subprocess.run(["xmake", "build", "_infinilm_llama"], check=True)
subprocess.run(["xmake", "install", "_infinilm_llama"], check=True)
subprocess.run(["xmake", "build", "_infinilm"], check=True)
subprocess.run(["xmake", "install", "_infinilm"], check=True)
class Build(build):
......
......@@ -27,7 +27,7 @@ except ImportError as e:
try:
from infinilm.models.llama import LlamaConfig, LlamaForCausalLM, Device
import _infinilm_llama # Import C++ bindings for HookRegistry
import _infinilm # Import C++ bindings for HookRegistry
except ImportError as e:
print(f"Error: InfiniLM Python package not found. Please install it: {e}")
sys.exit(1)
......@@ -756,7 +756,7 @@ def test_intermediate_validation(
infini_position_ids = torch_to_infinicore_tensor(position_ids, infini_device)
# Create hook registry and register hooks
hook_registry = _infinilm_llama.HookRegistry()
hook_registry = _infinilm.HookRegistry()
def make_infinilm_hook(name):
def hook(hook_name, tensor, layer_idx):
......
......@@ -36,7 +36,7 @@ except ImportError as e:
print(f"Error: InfiniLM Python package not found. Please install it:")
print(f" pip install -e .")
print(f" or")
print(f" xmake build _infinilm_llama && xmake install _infinilm_llama")
print(f" xmake build _infinilm && xmake install _infinilm")
print(f" Error: {e}")
sys.exit(1)
......@@ -487,9 +487,6 @@ def validate_inference(
def main():
"""Main test function"""
# Default model path
# default_model_dir = "/var/qy_home/zenghua/.cache/modelscope/hub/models/LLM-Research/Llama-3.2-1B-Instruct"
default_model_dir = "/var/qy_home/zenghua/.cache/modelscope/hub/models/AI-ModelScope/TinyLlama-1.1B-Chat-v1.0"
# Default prompt
default_prompt = "Hello, how are you?"
......@@ -545,8 +542,6 @@ def main():
sys.exit(1)
i += 1
if model_dir is None:
model_dir = default_model_dir
if not os.path.exists(model_dir):
print(f"Error: Model directory not found: {model_dir}")
......@@ -560,11 +555,11 @@ def main():
)
print(f" Examples: cpu, cuda, cuda:0, cuda:1")
print(f"\nExamples:")
print(f" {sys.argv[0]} {default_model_dir}")
print(f' {sys.argv[0]} {default_model_dir} --prompt "What is AI?"')
print(f" {sys.argv[0]} {default_model_dir} --device cuda:0")
print(f" {sys.argv[0]} dir/to/model")
print(f' {sys.argv[0]} dir/to/model --prompt "What is AI?"')
print(f" {sys.argv[0]} dir/to/model --device cuda:0")
print(
f' {sys.argv[0]} {default_model_dir} --prompt "What is AI?" --device cuda:0'
f' {sys.argv[0]} dir/to/model --prompt "What is AI?" --device cuda:0'
)
sys.exit(1)
......
......@@ -32,8 +32,7 @@ target("infinicore_infer")
add_installfiles("include/infinicore_infer/models/*.h", {prefixdir = "include/infinicore_infer/models"})
target_end()
-- Python bindings for Llama model
target("_infinilm_llama")
target("_infinilm")
add_packages("pybind11")
set_default(false)
add_rules("python.module", {soabi = true})
......@@ -52,8 +51,7 @@ target("_infinilm_llama")
add_links("infinicore_cpp_api", "infiniop", "infinirt", "infiniccl")
-- Add Llama model files
add_files("csrc/models/llama/llama_*.cpp")
add_files("csrc/models/debug_utils/*.cpp")
add_files("csrc/models/*/*.cpp")
add_files("csrc/models/pybind11/models.cc")
set_installdir("python/infinilm")
......