Commit d6a641d3 authored by Ceng23333's avatar Ceng23333
Browse files

issue/74 add c++ Llama models and align to AutoLlama interface


Signed-off-by: Ceng23333 <441651826@qq.com>
parent 3c6ad521
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/numpy.h>
#include "../../cache/kv_cache.hpp"
#include "../../debug_utils/hooks.hpp"
#include "../../llama/llama.hpp"
#include "../../llama/llama_attention.hpp"
#include "infinicore/device.hpp"
#include "infinicore/tensor.hpp"
#include "infinicore/nn/module.hpp"
namespace py = pybind11;
using infinicore::Device;
using infinilm::models::debug_utils::HookRegistry;
namespace infinilm::models::llama {
// Register all Llama-related Python bindings on module `m`:
//   * HookRegistry      - debug hook registration
//   * LlamaConfig       - model hyper-parameters (mirrors the HF config schema)
//   * LlamaForCausalLM  - causal-LM model (state_dict / load_state_dict / forward)
inline void bind_llama(py::module &m) {
    // TODO: HookRegistry should be moved out from Llama-specific bindings to InfiniCore as common utils in future work
    // Bind HookRegistry
    py::class_<HookRegistry, std::shared_ptr<HookRegistry>>(m, "HookRegistry")
        .def(py::init<>())
        .def("register_hook", [](HookRegistry &self, const std::string &name, py::object callback) {
            // Wrap the Python callable in a C++ callback. A Python exception
            // raised inside `callback` surfaces as py::error_already_set and
            // propagates through pybind11 unchanged, so no explicit
            // catch-and-rethrow is needed here.
            self.register_hook(name, [callback](const std::string &hook_name, const infinicore::Tensor &tensor, int layer_idx) {
                // Call Python callback with hook name, tensor, and layer index
                callback(hook_name, tensor, layer_idx);
            });
        }, py::arg("name"), py::arg("callback"))
        .def("clear", &HookRegistry::clear)
        .def("has_hooks", &HookRegistry::has_hooks);

    // Bind LlamaConfig: plain read/write fields plus validate()/kv_dim().
    py::class_<LlamaConfig> config(m, "LlamaConfig");
    config
        .def(py::init<>())
        .def_readwrite("vocab_size", &LlamaConfig::vocab_size)
        .def_readwrite("hidden_size", &LlamaConfig::hidden_size)
        .def_readwrite("intermediate_size", &LlamaConfig::intermediate_size)
        .def_readwrite("num_hidden_layers", &LlamaConfig::num_hidden_layers)
        .def_readwrite("num_attention_heads", &LlamaConfig::num_attention_heads)
        .def_readwrite("num_key_value_heads", &LlamaConfig::num_key_value_heads)
        .def_readwrite("head_dim", &LlamaConfig::head_dim)
        .def_readwrite("max_position_embeddings", &LlamaConfig::max_position_embeddings)
        .def_readwrite("rms_norm_eps", &LlamaConfig::rms_norm_eps)
        .def_readwrite("hidden_act", &LlamaConfig::hidden_act)
        .def_readwrite("model_type", &LlamaConfig::model_type)
        .def_readwrite("rope_theta", &LlamaConfig::rope_theta)
        .def_readwrite("attention_bias", &LlamaConfig::attention_bias)
        .def_readwrite("mlp_bias", &LlamaConfig::mlp_bias)
        .def_readwrite("tie_word_embeddings", &LlamaConfig::tie_word_embeddings)
        .def_readwrite("use_cache", &LlamaConfig::use_cache)
        .def_readwrite("pad_token_id", &LlamaConfig::pad_token_id)
        .def_readwrite("bos_token_id", &LlamaConfig::bos_token_id)
        .def_readwrite("eos_token_id", &LlamaConfig::eos_token_id)
        .def("validate", &LlamaConfig::validate)
        .def("kv_dim", &LlamaConfig::kv_dim);

    // Note: Device is already bound in InfiniCore bindings, so we don't need to bind it here

    // Helper: convert a Python object (InfiniCore tensor, numpy array, or
    // torch tensor) to a C++ infinicore::Tensor on `device`.
    // NOTE(review): for numpy/torch inputs the result is built with from_blob,
    // i.e. it aliases the Python object's buffer — the caller must keep that
    // Python object alive for the tensor's lifetime; confirm against the
    // from_blob contract in InfiniCore.
    auto convert_to_tensor = [](py::object obj, const Device &device) -> infinicore::Tensor {
        // First check if it's already an InfiniCore tensor (has _underlying attribute)
        if (py::hasattr(obj, "_underlying")) {
            try {
                // Extract the underlying C++ tensor from Python InfiniCore tensor
                return obj.attr("_underlying").cast<infinicore::Tensor>();
            } catch (const py::cast_error &) {
                // Fall through to other conversion methods
            }
        }
        // Try direct cast (in case it's already a C++ tensor exposed to Python)
        try {
            return obj.cast<infinicore::Tensor>();
        } catch (const py::cast_error &) {
            // Not an InfiniCore tensor, continue with other conversions
        }
        // Extract data pointer, shape, and dtype from a numpy array or torch tensor.
        void *data_ptr = nullptr;
        std::vector<size_t> shape;
        // NOTE(review): unrecognized dtypes silently fall back to F32 below —
        // consider raising instead if that default proves unsafe.
        infinicore::DataType dtype = infinicore::DataType::F32;
        if (py::hasattr(obj, "__array_interface__")) {
            // numpy array: read pointer/shape/typestr from __array_interface__.
            auto array_info = obj.attr("__array_interface__");
            auto data = array_info["data"];
            if (py::isinstance<py::tuple>(data)) {
                // "data" is normally a (pointer, read_only_flag) tuple.
                auto data_tuple = data.cast<py::tuple>();
                data_ptr = reinterpret_cast<void *>(data_tuple[0].cast<uintptr_t>());
            } else {
                data_ptr = reinterpret_cast<void *>(data.cast<uintptr_t>());
            }
            auto shape_obj = array_info["shape"];
            if (py::isinstance<py::tuple>(shape_obj)) {
                auto shape_tuple = shape_obj.cast<py::tuple>();
                for (auto dim : shape_tuple) {
                    shape.push_back(dim.cast<size_t>());
                }
            } else {
                shape.push_back(shape_obj.cast<size_t>());
            }
            // Map numpy typestr to InfiniCore dtype (little-endian codes).
            std::string typestr = array_info["typestr"].cast<std::string>();
            if (typestr == "<f4" || typestr == "float32") {
                dtype = infinicore::DataType::F32;
            } else if (typestr == "<f2" || typestr == "float16") {
                dtype = infinicore::DataType::F16;
            } else if (typestr == "<i4" || typestr == "int32") {
                dtype = infinicore::DataType::I32;
            } else if (typestr == "<i8" || typestr == "int64") {
                dtype = infinicore::DataType::I64;
            }
        } else if (py::hasattr(obj, "data_ptr")) {
            // torch tensor: use data_ptr()/shape/dtype attributes.
            data_ptr = reinterpret_cast<void *>(obj.attr("data_ptr")().cast<uintptr_t>());
            auto shape_obj = obj.attr("shape");
            if (py::isinstance<py::tuple>(shape_obj) || py::isinstance<py::list>(shape_obj)) {
                for (auto dim : shape_obj) {
                    shape.push_back(dim.cast<size_t>());
                }
            } else {
                shape.push_back(shape_obj.cast<size_t>());
            }
            // Map the torch dtype's string form to an InfiniCore dtype.
            std::string dtype_str = py::str(obj.attr("dtype"));
            if (dtype_str.find("float32") != std::string::npos) {
                dtype = infinicore::DataType::F32;
            } else if (dtype_str.find("float16") != std::string::npos) {
                dtype = infinicore::DataType::F16;
            } else if (dtype_str.find("int32") != std::string::npos) {
                dtype = infinicore::DataType::I32;
            } else if (dtype_str.find("int64") != std::string::npos) {
                dtype = infinicore::DataType::I64;
            }
        } else {
            throw std::runtime_error("Unsupported tensor type. Expected InfiniCore tensor, numpy array, or torch tensor.");
        }
        return infinicore::Tensor::from_blob(data_ptr, shape, dtype, device);
    };

    // Bind LlamaForCausalLM
    py::class_<LlamaForCausalLM, std::shared_ptr<LlamaForCausalLM>>(m, "LlamaForCausalLM")
        .def(py::init([](const LlamaConfig &config, const Device &device, py::object dtype_obj) {
            // Default dtype is F32 when `dtype` is None.
            infinicore::DataType dtype = infinicore::DataType::F32;
            if (!dtype_obj.is_none()) {
                // Accept either a Python wrapper (with _underlying) or the C++ enum.
                if (py::hasattr(dtype_obj, "_underlying")) {
                    dtype = dtype_obj.attr("_underlying").cast<infinicore::DataType>();
                } else {
                    dtype = dtype_obj.cast<infinicore::DataType>();
                }
            }
            return std::make_shared<LlamaForCausalLM>(config, device, dtype);
        }), py::arg("config"), py::arg("device"), py::arg("dtype") = py::none())
        .def("state_dict", [](const LlamaForCausalLM &model) {
            // Return {name: {"shape": [...], "dtype": int}} — metadata only,
            // not the tensors themselves (use get_parameter for data).
            auto state_dict = model.state_dict();
            py::dict result;
            for (const auto &[name, param] : state_dict) {
                // Parameter is a shared_ptr<Tensor>; read shape/dtype through it.
                py::dict param_info;
                param_info["shape"] = py::cast(param->shape());
                param_info["dtype"] = py::cast(static_cast<int>(param->dtype()));
                result[py::cast(name)] = param_info;
            }
            return result;
        })
        .def("get_parameter", [](const LlamaForCausalLM &model, const std::string &name) {
            // Look up a single parameter tensor by its state_dict name.
            auto state_dict = model.state_dict();
            auto it = state_dict.find(name);
            if (it != state_dict.end()) {
                // Parameter inherits from Tensor; expose it as a Tensor to Python.
                const infinicore::Tensor &tensor = it->second;
                return tensor;
            }
            throw std::runtime_error("Parameter '" + name + "' not found in model");
        }, py::arg("name"))
        .def("load_state_dict", [convert_to_tensor](LlamaForCausalLM &model, py::dict state_dict, const Device &device) {
            // Convert the Python dict of name -> tensor-like into the C++
            // state_dict form, converting each value onto `device`.
            std::unordered_map<std::string, infinicore::Tensor> cpp_state_dict;
            for (auto item : state_dict) {
                std::string key = item.first.cast<std::string>();
                py::object value = item.second.cast<py::object>();
                cpp_state_dict.emplace(key, convert_to_tensor(value, device));
            }
            model.load_state_dict(cpp_state_dict);
        }, py::arg("state_dict"), py::arg("device"))
        .def("config", &LlamaForCausalLM::config, py::return_value_policy::reference_internal)
        // Note: the Python-side default for kv_caches comes from py::arg below;
        // a default argument on the lambda itself would be dead code (pybind11
        // always forwards all arguments) and is non-portable.
        .def("forward", [convert_to_tensor](const LlamaForCausalLM &model, py::object input_ids, py::object position_ids, py::object kv_caches) {
            // Helper to extract a C++ tensor from a Python object.
            auto get_tensor = [convert_to_tensor](py::object obj) -> infinicore::Tensor {
                // If it's already a Python InfiniCore tensor wrapper, extract underlying
                if (py::hasattr(obj, "_underlying")) {
                    return obj.attr("_underlying").cast<infinicore::Tensor>();
                }
                // Try direct cast (in case it's already a C++ tensor)
                try {
                    return obj.cast<infinicore::Tensor>();
                } catch (const py::cast_error &) {
                    // Fall back to blob conversion; take the device from the
                    // object itself when available, else default to CPU:0.
                    Device device = Device(Device::Type::CPU, 0);
                    if (py::hasattr(obj, "device")) {
                        try {
                            auto py_device = obj.attr("device");
                            if (py::hasattr(py_device, "_underlying")) {
                                device = py_device.attr("_underlying").cast<Device>();
                            } else {
                                device = py_device.cast<Device>();
                            }
                        } catch (...) {
                            // Keep default CPU device
                        }
                    }
                    return convert_to_tensor(obj, device);
                }
            };
            // Convert Python tensors to C++ tensors
            auto infini_input_ids = get_tensor(input_ids);
            auto infini_position_ids = get_tensor(position_ids);
            // TODO: the kv_caches argument is currently ignored — a null
            // cache pointer is always passed to the C++ forward.
            std::vector<void *> *kv_caches_ptr = nullptr;
            return model.forward(infini_input_ids, infini_position_ids, kv_caches_ptr);
        }, py::arg("input_ids"), py::arg("position_ids"), py::arg("kv_caches") = py::none());
}
} // namespace infinilm::models::llama
import infinicore
from transformers import AutoTokenizer
from tokenizers import decoders as _dec
from infinilm.modeling_utils import get_model_state_dict
import infinilm
import argparse
import sys
import time
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))
import argparse
import infinilm
from infinilm.modeling_utils import get_model_state_dict
from tokenizers import decoders as _dec
from transformers import AutoTokenizer
import infinicore
def get_args():
parser = argparse.ArgumentParser(description="run Llama args")
......@@ -59,6 +57,12 @@ def get_args():
default="python",
help="python or cpp model",
)
parser.add_argument(
"--dtype",
type=str,
default="float32",
help="float32, float16, bfloat16",
)
return parser.parse_args()
......@@ -112,6 +116,8 @@ def test(
_dec.Fuse(),
]
)
else:
raise ValueError(f"Unsupported model type: {config.model_type}")
# ---------------------------------------------------------------------------- #
# token编码
......@@ -132,6 +138,7 @@ def test(
input_ids_infini = infinicore.from_list(input_ids_list)
t1 = time.time()
print("=================== start generate ====================")
model.generate(
input_ids_infini,
max_new_tokens=max_new_tokens,
......@@ -168,14 +175,21 @@ if __name__ == "__main__":
"such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
)
sys.exit(1)
prompt = "山东最高的山是?"
prompt = "How are you"
model_path = args.model_path
max_new_tokens = args.max_new_tokens
backend = args.backend
infini_device = infinicore.device(device_str, 0)
infini_dtype = infinicore.bfloat16
if args.dtype == "float32":
infini_dtype = infinicore.float32
elif args.dtype == "bfloat16":
infini_dtype = infinicore.bfloat16
elif args.dtype == "float16":
infini_dtype = infinicore.float16
else:
raise ValueError(f"Unsupported dtype: {args.dtype}")
test(
prompt,
......
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"
[project]
name = "InfiniLM"
version = "0.1.0"
description = "InfiniLM model implementations"
readme = "README.md"
dependencies = []
requires-python = ">=3.10"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
[project.urls]
Homepage = "https://github.com/InfiniTensor/InfiniLM"
......@@ -246,10 +246,10 @@ class GenerationMixin:
print("\n</s>")
print(
f"\n\n\n Time per step: prefill {round(time_list[0], 2)} token/ms\n",
f"\n\n\n Time per step: prefill {round(time_list[0], 2)} ms/token\n",
)
print(
f" Time per step: decoder {round(sum(time_list[1:]) / (len(time_list) - 1), 2)} token/ms \n",
f" Time per step: decoder {round(sum(time_list[1:]) / (len(time_list) - 1), 2)} ms/token \n",
)
return output_tokens_list, output_content
"""
InfiniLM C++ extension module
"""
import sys
import os
from pathlib import Path
# Ensure the directory containing this __init__.py is on sys.path
# This allows importing the .so file from the same directory
_lib_dir = Path(__file__).parent
if str(_lib_dir) not in sys.path:
sys.path.insert(0, str(_lib_dir))
# Import the compiled C++ module
# The .so file should be installed in this directory by xmake
import _infinilm_llama
__all__ = ["_infinilm_llama"]
from ....generation.utils import GenerationMixin
import infinicore
from infinilm.models.llama.configuration_llama import LlamaConfig as _LlamaConfig
from infinilm.lib import _infinilm_llama
import json
import os
from typing import Optional, Union
class LlamaConfig:
    """Llama model configuration adapter for C++ bindings.

    Wraps ``configuration_llama.LlamaConfig`` and exposes a ``_underlying``
    property that lazily builds (and caches) the equivalent C++ config
    object. The cache is invalidated whenever a public attribute changes.
    """

    def __init__(self, config_dict=None, **kwargs):
        """Create LlamaConfig from a dict, a Python config instance, or kwargs.

        Args:
            config_dict: dict of config values, or an existing
                configuration_llama ``LlamaConfig`` to wrap directly.
            **kwargs: individual config values; override ``config_dict``
                entries when both are supplied.
        """
        if isinstance(config_dict, _LlamaConfig):
            self._python_config = config_dict
        else:
            # isinstance(None, dict) is False, so a kwargs-only call works.
            if isinstance(config_dict, dict):
                merged = {**config_dict, **kwargs}
            else:
                merged = kwargs
            self._python_config = _LlamaConfig(**merged)
        # Lazily-built C++ config; rebuilt after any attribute change.
        self._cpp_config = None

    def __getattr__(self, name):
        """Delegate attribute reads to the wrapped Python config."""
        # Guard against infinite recursion: if _python_config itself is
        # missing (e.g. during unpickling, before __init__ has run),
        # `self._python_config` would re-enter __getattr__ forever.
        if name == "_python_config":
            raise AttributeError(name)
        return getattr(self._python_config, name)

    def __setattr__(self, name, value):
        """Delegate public attribute writes to the wrapped Python config."""
        if name.startswith("_"):
            # Private state lives on the adapter itself.
            super().__setattr__(name, value)
        else:
            if hasattr(self, "_python_config"):
                setattr(self._python_config, name, value)
                # Invalidate C++ config cache when Python config changes
                self._cpp_config = None
            else:
                super().__setattr__(name, value)

    @property
    def _underlying(self):
        """Underlying C++ config object, created on first access."""
        if self._cpp_config is None:
            self._cpp_config = _infinilm_llama.LlamaConfig()
            # Copy every public, non-callable attribute the C++ config knows.
            for key in dir(self._python_config):
                if key.startswith("_"):
                    continue
                try:
                    value = getattr(self._python_config, key)
                    if hasattr(self._cpp_config, key) and not callable(value):
                        setattr(self._cpp_config, key, value)
                except (AttributeError, TypeError):
                    # Skip attributes the C++ side rejects or cannot convert.
                    pass
            # Fill in defaults the HF config may leave unset or zero.
            if (
                not hasattr(self._cpp_config, "num_key_value_heads")
                or self._cpp_config.num_key_value_heads == 0
            ):
                # No GQA configured: fall back to multi-head attention.
                self._cpp_config.num_key_value_heads = (
                    self._cpp_config.num_attention_heads
                )
            if (
                not hasattr(self._cpp_config, "head_dim")
                or self._cpp_config.head_dim == 0
            ):
                self._cpp_config.head_dim = (
                    self._cpp_config.hidden_size // self._cpp_config.num_attention_heads
                )
        return self._cpp_config
class LlamaForCausalLM(GenerationMixin):
def __init__(self):
"""Llama model for causal language modeling"""
def __init__(self, config, device=None, dtype=None):
"""
Create LlamaForCausalLM
Args:
config: LlamaConfig instance or dict
device: Device instance (defaults to CPU)
dtype: Optional dtype for model parameters (defaults to None)
"""
super().__init__()
if isinstance(config, dict):
config = LlamaConfig(**config)
elif not isinstance(config, LlamaConfig):
config = LlamaConfig(**config)
if device is None:
device = infinicore.device()
self.use_cache = False
self._model = None
raise NotImplementedError("NotImplementedError!!")
self._device = device
self._model = _infinilm_llama.LlamaForCausalLM(
config._underlying, device._underlying, dtype
)
def state_dict(self):
"""Get model state dictionary with parameter shapes"""
return self._model.state_dict()
def load_state_dict(self, state_dict):
"""
Load state dictionary into the model
Args:
state_dict: Dictionary mapping parameter names to InfiniCore tensors, numpy arrays, or torch tensors
"""
self._model.load_state_dict(state_dict, self._device._underlying)
def get_parameter(self, name):
"""
Get a parameter tensor by name
Args:
name: Parameter name
Returns:
InfiniCore tensor
"""
return self._model.get_parameter(name)
@property
def config(self):
"""Get model configuration"""
return self._model.config()
def forward(self, input_ids, position_ids, *args, **kwargs):
kv_caches = None
......@@ -24,15 +154,26 @@ class LlamaForCausalLM(GenerationMixin):
def from_pretrained(
cls,
model_path: Union[str, os.PathLike],
device: infinicore.device,
dtype=infinicore.dtype,
device: Optional[infinicore.device] = None,
dtype: Optional[infinicore.dtype] = None,
):
"""
Load a pretrained LlamaForCausalLM model from a directory.
Args:
model_path: Path to the model directory containing config.json
device: Device instance (defaults to CPU)
dtype: Optional dtype for model parameters (defaults to None)
Returns:
LlamaForCausalLM instance
"""
raise NotImplementedError("NotImplementedError!!")
config_path = os.path.join(model_path, "config.json")
if not os.path.exists(config_path):
raise FileNotFoundError(f"Config file not found: {config_path}")
with open(config_path, "r") as f:
config_dict = json.load(f)
config = LlamaConfig(config_dict)
return cls(config, device=device, dtype=dtype)
......@@ -49,7 +49,7 @@ def repeat_kv(keys: infinicore.Tensor, values: infinicore.Tensor, ngroup: int):
def multi_head_attention(
querys: infinicore.Tensor, # [seq_len, num_heads, head_dim]
keys: infinicore.Tensor, # [total_seq_len, num_heads, head_dim]
keys: infinicore.Tensor, # [total_seq_len, num_heads, head_dim]
values: infinicore.Tensor, # [total_seq_len, num_heads, head_dim]
scaling: float,
):
......@@ -81,9 +81,11 @@ def multi_head_attention(
def grouped_query_attention(
querys: infinicore.Tensor, # [seq_len, num_attention_heads, head_dim]
keys: infinicore.Tensor, # [total_seq_len, num_key_value_heads, head_dim]
values: infinicore.Tensor, # [total_seq_len, num_key_value_heads, head_dim]
# [seq_len, num_attention_heads, head_dim]
querys: infinicore.Tensor,
keys: infinicore.Tensor, # [total_seq_len, num_key_value_heads, head_dim]
# [total_seq_len, num_key_value_heads, head_dim]
values: infinicore.Tensor,
scaling: float,
):
num_attention_heads = querys.shape[1]
......@@ -175,7 +177,7 @@ class LlamaAttention(infinicore.nn.Module):
**kwargs,
) -> infinicore.Tensor:
hidden_states_shape = hidden_states.shape # [bs, seq_len, hidden_size]
bs, seq_len = hidden_states_shape[:-1] # [bs, seq_len]
bs, seq_len = hidden_states_shape[:-1] # [bs, seq_len]
querys_shape = (bs, seq_len, self.num_attention_heads, self.head_dim)
keys_shape = (bs, seq_len, self.num_key_value_heads, self.head_dim)
......
import subprocess
from pathlib import Path
from setuptools import setup
from setuptools.command.build import build
from setuptools.command.develop import develop
from setuptools.command.egg_info import egg_info
def build_cpp_module():
    """Compile the C++ extension with xmake, then install it into the tree."""
    for action in ("build", "install"):
        subprocess.run(["xmake", action, "_infinilm_llama"], check=True)
class Build(build):
    """`build` command that compiles the native extension first."""

    def run(self):
        build_cpp_module()
        build.run(self)
class Develop(develop):
    """`develop` (editable install) command that compiles the extension first."""

    def run(self):
        build_cpp_module()
        develop.run(self)
class EggInfo(egg_info):
    """`egg_info` command that builds the C++ module before metadata creation."""

    def run(self):
        # Ensure C++ module is built before creating egg-info
        build_cpp_module()
        egg_info.run(self)
setup(
name="InfiniLM",
version="0.1.0",
description="InfiniLM model implementations",
package_dir={"": "python"},
packages=["infinilm", "infinilm.models", "infinilm.lib"],
cmdclass={
"build": Build,
"develop": Develop,
"egg_info": EggInfo,
},
python_requires=">=3.10",
)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Subproject commit 88a0e07ad5bb3e2651cd5613530b3f06a15fc400
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment