Commit d6a641d3 authored by Ceng23333's avatar Ceng23333
Browse files

issue/74 add c++ Llama models and align to AutoLlama interface


Signed-off-by: Ceng23333 <441651826@qq.com>
parent 3c6ad521
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/numpy.h>
#include "../../cache/kv_cache.hpp"
#include "../../debug_utils/hooks.hpp"
#include "../../llama/llama.hpp"
#include "../../llama/llama_attention.hpp"
#include "infinicore/device.hpp"
#include "infinicore/tensor.hpp"
#include "infinicore/nn/module.hpp"
namespace py = pybind11;
using infinicore::Device;
using infinilm::models::debug_utils::HookRegistry;
namespace infinilm::models::llama {
// Register all Llama-related Python bindings on module `m`:
//   * HookRegistry      - debug hook registration
//   * LlamaConfig       - model hyper-parameters (mirrors the HF config schema)
//   * LlamaForCausalLM  - causal-LM model (state_dict / load_state_dict / forward)
inline void bind_llama(py::module &m) {
    // TODO: HookRegistry should be moved out from Llama-specific bindings to InfiniCore as common utils in future work
    // Bind HookRegistry
    py::class_<HookRegistry, std::shared_ptr<HookRegistry>>(m, "HookRegistry")
        .def(py::init<>())
        .def("register_hook", [](HookRegistry &self, const std::string &name, py::object callback) {
            // Wrap the Python callable in a C++ callback. A Python exception
            // raised inside `callback` surfaces as py::error_already_set and
            // propagates through pybind11 unchanged, so no explicit
            // catch-and-rethrow is needed here.
            self.register_hook(name, [callback](const std::string &hook_name, const infinicore::Tensor &tensor, int layer_idx) {
                // Call Python callback with hook name, tensor, and layer index
                callback(hook_name, tensor, layer_idx);
            });
        }, py::arg("name"), py::arg("callback"))
        .def("clear", &HookRegistry::clear)
        .def("has_hooks", &HookRegistry::has_hooks);

    // Bind LlamaConfig: plain read/write fields plus validate()/kv_dim().
    py::class_<LlamaConfig> config(m, "LlamaConfig");
    config
        .def(py::init<>())
        .def_readwrite("vocab_size", &LlamaConfig::vocab_size)
        .def_readwrite("hidden_size", &LlamaConfig::hidden_size)
        .def_readwrite("intermediate_size", &LlamaConfig::intermediate_size)
        .def_readwrite("num_hidden_layers", &LlamaConfig::num_hidden_layers)
        .def_readwrite("num_attention_heads", &LlamaConfig::num_attention_heads)
        .def_readwrite("num_key_value_heads", &LlamaConfig::num_key_value_heads)
        .def_readwrite("head_dim", &LlamaConfig::head_dim)
        .def_readwrite("max_position_embeddings", &LlamaConfig::max_position_embeddings)
        .def_readwrite("rms_norm_eps", &LlamaConfig::rms_norm_eps)
        .def_readwrite("hidden_act", &LlamaConfig::hidden_act)
        .def_readwrite("model_type", &LlamaConfig::model_type)
        .def_readwrite("rope_theta", &LlamaConfig::rope_theta)
        .def_readwrite("attention_bias", &LlamaConfig::attention_bias)
        .def_readwrite("mlp_bias", &LlamaConfig::mlp_bias)
        .def_readwrite("tie_word_embeddings", &LlamaConfig::tie_word_embeddings)
        .def_readwrite("use_cache", &LlamaConfig::use_cache)
        .def_readwrite("pad_token_id", &LlamaConfig::pad_token_id)
        .def_readwrite("bos_token_id", &LlamaConfig::bos_token_id)
        .def_readwrite("eos_token_id", &LlamaConfig::eos_token_id)
        .def("validate", &LlamaConfig::validate)
        .def("kv_dim", &LlamaConfig::kv_dim);

    // Note: Device is already bound in InfiniCore bindings, so we don't need to bind it here

    // Helper: convert a Python object (InfiniCore tensor, numpy array, or
    // torch tensor) to a C++ infinicore::Tensor on `device`.
    // NOTE(review): for numpy/torch inputs the result is built with from_blob,
    // i.e. it aliases the Python object's buffer — the caller must keep that
    // Python object alive for the tensor's lifetime; confirm against the
    // from_blob contract in InfiniCore.
    auto convert_to_tensor = [](py::object obj, const Device &device) -> infinicore::Tensor {
        // First check if it's already an InfiniCore tensor (has _underlying attribute)
        if (py::hasattr(obj, "_underlying")) {
            try {
                // Extract the underlying C++ tensor from Python InfiniCore tensor
                return obj.attr("_underlying").cast<infinicore::Tensor>();
            } catch (const py::cast_error &) {
                // Fall through to other conversion methods
            }
        }
        // Try direct cast (in case it's already a C++ tensor exposed to Python)
        try {
            return obj.cast<infinicore::Tensor>();
        } catch (const py::cast_error &) {
            // Not an InfiniCore tensor, continue with other conversions
        }
        // Extract data pointer, shape, and dtype from a numpy array or torch tensor.
        void *data_ptr = nullptr;
        std::vector<size_t> shape;
        // NOTE(review): unrecognized dtypes silently fall back to F32 below —
        // consider raising instead if that default proves unsafe.
        infinicore::DataType dtype = infinicore::DataType::F32;
        if (py::hasattr(obj, "__array_interface__")) {
            // numpy array: read pointer/shape/typestr from __array_interface__.
            auto array_info = obj.attr("__array_interface__");
            auto data = array_info["data"];
            if (py::isinstance<py::tuple>(data)) {
                // "data" is normally a (pointer, read_only_flag) tuple.
                auto data_tuple = data.cast<py::tuple>();
                data_ptr = reinterpret_cast<void *>(data_tuple[0].cast<uintptr_t>());
            } else {
                data_ptr = reinterpret_cast<void *>(data.cast<uintptr_t>());
            }
            auto shape_obj = array_info["shape"];
            if (py::isinstance<py::tuple>(shape_obj)) {
                auto shape_tuple = shape_obj.cast<py::tuple>();
                for (auto dim : shape_tuple) {
                    shape.push_back(dim.cast<size_t>());
                }
            } else {
                shape.push_back(shape_obj.cast<size_t>());
            }
            // Map numpy typestr to InfiniCore dtype (little-endian codes).
            std::string typestr = array_info["typestr"].cast<std::string>();
            if (typestr == "<f4" || typestr == "float32") {
                dtype = infinicore::DataType::F32;
            } else if (typestr == "<f2" || typestr == "float16") {
                dtype = infinicore::DataType::F16;
            } else if (typestr == "<i4" || typestr == "int32") {
                dtype = infinicore::DataType::I32;
            } else if (typestr == "<i8" || typestr == "int64") {
                dtype = infinicore::DataType::I64;
            }
        } else if (py::hasattr(obj, "data_ptr")) {
            // torch tensor: use data_ptr()/shape/dtype attributes.
            data_ptr = reinterpret_cast<void *>(obj.attr("data_ptr")().cast<uintptr_t>());
            auto shape_obj = obj.attr("shape");
            if (py::isinstance<py::tuple>(shape_obj) || py::isinstance<py::list>(shape_obj)) {
                for (auto dim : shape_obj) {
                    shape.push_back(dim.cast<size_t>());
                }
            } else {
                shape.push_back(shape_obj.cast<size_t>());
            }
            // Map the torch dtype's string form to an InfiniCore dtype.
            std::string dtype_str = py::str(obj.attr("dtype"));
            if (dtype_str.find("float32") != std::string::npos) {
                dtype = infinicore::DataType::F32;
            } else if (dtype_str.find("float16") != std::string::npos) {
                dtype = infinicore::DataType::F16;
            } else if (dtype_str.find("int32") != std::string::npos) {
                dtype = infinicore::DataType::I32;
            } else if (dtype_str.find("int64") != std::string::npos) {
                dtype = infinicore::DataType::I64;
            }
        } else {
            throw std::runtime_error("Unsupported tensor type. Expected InfiniCore tensor, numpy array, or torch tensor.");
        }
        return infinicore::Tensor::from_blob(data_ptr, shape, dtype, device);
    };

    // Bind LlamaForCausalLM
    py::class_<LlamaForCausalLM, std::shared_ptr<LlamaForCausalLM>>(m, "LlamaForCausalLM")
        .def(py::init([](const LlamaConfig &config, const Device &device, py::object dtype_obj) {
            // Default dtype is F32 when `dtype` is None.
            infinicore::DataType dtype = infinicore::DataType::F32;
            if (!dtype_obj.is_none()) {
                // Accept either a Python wrapper (with _underlying) or the C++ enum.
                if (py::hasattr(dtype_obj, "_underlying")) {
                    dtype = dtype_obj.attr("_underlying").cast<infinicore::DataType>();
                } else {
                    dtype = dtype_obj.cast<infinicore::DataType>();
                }
            }
            return std::make_shared<LlamaForCausalLM>(config, device, dtype);
        }), py::arg("config"), py::arg("device"), py::arg("dtype") = py::none())
        .def("state_dict", [](const LlamaForCausalLM &model) {
            // Return {name: {"shape": [...], "dtype": int}} — metadata only,
            // not the tensors themselves (use get_parameter for data).
            auto state_dict = model.state_dict();
            py::dict result;
            for (const auto &[name, param] : state_dict) {
                // Parameter is a shared_ptr<Tensor>; read shape/dtype through it.
                py::dict param_info;
                param_info["shape"] = py::cast(param->shape());
                param_info["dtype"] = py::cast(static_cast<int>(param->dtype()));
                result[py::cast(name)] = param_info;
            }
            return result;
        })
        .def("get_parameter", [](const LlamaForCausalLM &model, const std::string &name) {
            // Look up a single parameter tensor by its state_dict name.
            auto state_dict = model.state_dict();
            auto it = state_dict.find(name);
            if (it != state_dict.end()) {
                // Parameter inherits from Tensor; expose it as a Tensor to Python.
                const infinicore::Tensor &tensor = it->second;
                return tensor;
            }
            throw std::runtime_error("Parameter '" + name + "' not found in model");
        }, py::arg("name"))
        .def("load_state_dict", [convert_to_tensor](LlamaForCausalLM &model, py::dict state_dict, const Device &device) {
            // Convert the Python dict of name -> tensor-like into the C++
            // state_dict form, converting each value onto `device`.
            std::unordered_map<std::string, infinicore::Tensor> cpp_state_dict;
            for (auto item : state_dict) {
                std::string key = item.first.cast<std::string>();
                py::object value = item.second.cast<py::object>();
                cpp_state_dict.emplace(key, convert_to_tensor(value, device));
            }
            model.load_state_dict(cpp_state_dict);
        }, py::arg("state_dict"), py::arg("device"))
        .def("config", &LlamaForCausalLM::config, py::return_value_policy::reference_internal)
        // Note: the Python-side default for kv_caches comes from py::arg below;
        // a default argument on the lambda itself would be dead code (pybind11
        // always forwards all arguments) and is non-portable.
        .def("forward", [convert_to_tensor](const LlamaForCausalLM &model, py::object input_ids, py::object position_ids, py::object kv_caches) {
            // Helper to extract a C++ tensor from a Python object.
            auto get_tensor = [convert_to_tensor](py::object obj) -> infinicore::Tensor {
                // If it's already a Python InfiniCore tensor wrapper, extract underlying
                if (py::hasattr(obj, "_underlying")) {
                    return obj.attr("_underlying").cast<infinicore::Tensor>();
                }
                // Try direct cast (in case it's already a C++ tensor)
                try {
                    return obj.cast<infinicore::Tensor>();
                } catch (const py::cast_error &) {
                    // Fall back to blob conversion; take the device from the
                    // object itself when available, else default to CPU:0.
                    Device device = Device(Device::Type::CPU, 0);
                    if (py::hasattr(obj, "device")) {
                        try {
                            auto py_device = obj.attr("device");
                            if (py::hasattr(py_device, "_underlying")) {
                                device = py_device.attr("_underlying").cast<Device>();
                            } else {
                                device = py_device.cast<Device>();
                            }
                        } catch (...) {
                            // Keep default CPU device
                        }
                    }
                    return convert_to_tensor(obj, device);
                }
            };
            // Convert Python tensors to C++ tensors
            auto infini_input_ids = get_tensor(input_ids);
            auto infini_position_ids = get_tensor(position_ids);
            // TODO: the kv_caches argument is currently ignored — a null
            // cache pointer is always passed to the C++ forward.
            std::vector<void *> *kv_caches_ptr = nullptr;
            return model.forward(infini_input_ids, infini_position_ids, kv_caches_ptr);
        }, py::arg("input_ids"), py::arg("position_ids"), py::arg("kv_caches") = py::none());
}
} // namespace infinilm::models::llama
import infinicore
from transformers import AutoTokenizer
from tokenizers import decoders as _dec
from infinilm.modeling_utils import get_model_state_dict
import infinilm
import argparse
import sys
import time
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))
import argparse
import infinilm
from infinilm.modeling_utils import get_model_state_dict
from tokenizers import decoders as _dec
from transformers import AutoTokenizer
import infinicore
def get_args():
parser = argparse.ArgumentParser(description="run Llama args")
......@@ -59,6 +57,12 @@ def get_args():
default="python",
help="python or cpp model",
)
parser.add_argument(
"--dtype",
type=str,
default="float32",
help="float32, float16, bfloat16",
)
return parser.parse_args()
......@@ -112,6 +116,8 @@ def test(
_dec.Fuse(),
]
)
else:
raise ValueError(f"Unsupported model type: {config.model_type}")
# ---------------------------------------------------------------------------- #
# token编码
......@@ -132,6 +138,7 @@ def test(
input_ids_infini = infinicore.from_list(input_ids_list)
t1 = time.time()
print("=================== start generate ====================")
model.generate(
input_ids_infini,
max_new_tokens=max_new_tokens,
......@@ -168,14 +175,21 @@ if __name__ == "__main__":
"such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
)
sys.exit(1)
prompt = "山东最高的山是?"
prompt = "How are you"
model_path = args.model_path
max_new_tokens = args.max_new_tokens
backend = args.backend
infini_device = infinicore.device(device_str, 0)
infini_dtype = infinicore.bfloat16
if args.dtype == "float32":
infini_dtype = infinicore.float32
elif args.dtype == "bfloat16":
infini_dtype = infinicore.bfloat16
elif args.dtype == "float16":
infini_dtype = infinicore.float16
else:
raise ValueError(f"Unsupported dtype: {args.dtype}")
test(
prompt,
......
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"
[project]
name = "InfiniLM"
version = "0.1.0"
description = "InfiniLM model implementations"
readme = "README.md"
dependencies = []
requires-python = ">=3.10"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
[project.urls]
Homepage = "https://github.com/InfiniTensor/InfiniLM"
......@@ -246,10 +246,10 @@ class GenerationMixin:
print("\n</s>")
print(
f"\n\n\n Time per step: prefill {round(time_list[0], 2)} token/ms\n",
f"\n\n\n Time per step: prefill {round(time_list[0], 2)} ms/token\n",
)
print(
f" Time per step: decoder {round(sum(time_list[1:]) / (len(time_list) - 1), 2)} token/ms \n",
f" Time per step: decoder {round(sum(time_list[1:]) / (len(time_list) - 1), 2)} ms/token \n",
)
return output_tokens_list, output_content
"""
InfiniLM C++ extension module
"""
import sys
import os
from pathlib import Path
# Ensure the directory containing this __init__.py is on sys.path
# This allows importing the .so file from the same directory
_lib_dir = Path(__file__).parent
if str(_lib_dir) not in sys.path:
sys.path.insert(0, str(_lib_dir))
# Import the compiled C++ module
# The .so file should be installed in this directory by xmake
import _infinilm_llama
__all__ = ["_infinilm_llama"]
from ....generation.utils import GenerationMixin
import infinicore
from infinilm.models.llama.configuration_llama import LlamaConfig as _LlamaConfig
from infinilm.lib import _infinilm_llama
import json
import os
from typing import Optional, Union
class LlamaConfig:
    """Llama model configuration adapter for C++ bindings.

    Wraps ``configuration_llama.LlamaConfig`` and exposes a ``_underlying``
    property that lazily builds (and caches) the equivalent C++ config
    object. The cache is invalidated whenever a public attribute changes.
    """

    def __init__(self, config_dict=None, **kwargs):
        """Create LlamaConfig from a dict, a Python config instance, or kwargs.

        Args:
            config_dict: dict of config values, or an existing
                configuration_llama ``LlamaConfig`` to wrap directly.
            **kwargs: individual config values; override ``config_dict``
                entries when both are supplied.
        """
        if isinstance(config_dict, _LlamaConfig):
            self._python_config = config_dict
        else:
            # isinstance(None, dict) is False, so a kwargs-only call works.
            if isinstance(config_dict, dict):
                merged = {**config_dict, **kwargs}
            else:
                merged = kwargs
            self._python_config = _LlamaConfig(**merged)
        # Lazily-built C++ config; rebuilt after any attribute change.
        self._cpp_config = None

    def __getattr__(self, name):
        """Delegate attribute reads to the wrapped Python config."""
        # Guard against infinite recursion: if _python_config itself is
        # missing (e.g. during unpickling, before __init__ has run),
        # `self._python_config` would re-enter __getattr__ forever.
        if name == "_python_config":
            raise AttributeError(name)
        return getattr(self._python_config, name)

    def __setattr__(self, name, value):
        """Delegate public attribute writes to the wrapped Python config."""
        if name.startswith("_"):
            # Private state lives on the adapter itself.
            super().__setattr__(name, value)
        else:
            if hasattr(self, "_python_config"):
                setattr(self._python_config, name, value)
                # Invalidate C++ config cache when Python config changes
                self._cpp_config = None
            else:
                super().__setattr__(name, value)

    @property
    def _underlying(self):
        """Underlying C++ config object, created on first access."""
        if self._cpp_config is None:
            self._cpp_config = _infinilm_llama.LlamaConfig()
            # Copy every public, non-callable attribute the C++ config knows.
            for key in dir(self._python_config):
                if key.startswith("_"):
                    continue
                try:
                    value = getattr(self._python_config, key)
                    if hasattr(self._cpp_config, key) and not callable(value):
                        setattr(self._cpp_config, key, value)
                except (AttributeError, TypeError):
                    # Skip attributes the C++ side rejects or cannot convert.
                    pass
            # Fill in defaults the HF config may leave unset or zero.
            if (
                not hasattr(self._cpp_config, "num_key_value_heads")
                or self._cpp_config.num_key_value_heads == 0
            ):
                # No GQA configured: fall back to multi-head attention.
                self._cpp_config.num_key_value_heads = (
                    self._cpp_config.num_attention_heads
                )
            if (
                not hasattr(self._cpp_config, "head_dim")
                or self._cpp_config.head_dim == 0
            ):
                self._cpp_config.head_dim = (
                    self._cpp_config.hidden_size // self._cpp_config.num_attention_heads
                )
        return self._cpp_config
class LlamaForCausalLM(GenerationMixin):
def __init__(self):
"""Llama model for causal language modeling"""
def __init__(self, config, device=None, dtype=None):
"""
Create LlamaForCausalLM
Args:
config: LlamaConfig instance or dict
device: Device instance (defaults to CPU)
dtype: Optional dtype for model parameters (defaults to None)
"""
super().__init__()
if isinstance(config, dict):
config = LlamaConfig(**config)
elif not isinstance(config, LlamaConfig):
config = LlamaConfig(**config)
if device is None:
device = infinicore.device()
self.use_cache = False
self._model = None
raise NotImplementedError("NotImplementedError!!")
self._device = device
self._model = _infinilm_llama.LlamaForCausalLM(
config._underlying, device._underlying, dtype
)
def state_dict(self):
"""Get model state dictionary with parameter shapes"""
return self._model.state_dict()
def load_state_dict(self, state_dict):
"""
Load state dictionary into the model
Args:
state_dict: Dictionary mapping parameter names to InfiniCore tensors, numpy arrays, or torch tensors
"""
self._model.load_state_dict(state_dict, self._device._underlying)
def get_parameter(self, name):
"""
Get a parameter tensor by name
Args:
name: Parameter name
Returns:
InfiniCore tensor
"""
return self._model.get_parameter(name)
@property
def config(self):
"""Get model configuration"""
return self._model.config()
def forward(self, input_ids, position_ids, *args, **kwargs):
kv_caches = None
......@@ -24,15 +154,26 @@ class LlamaForCausalLM(GenerationMixin):
def from_pretrained(
cls,
model_path: Union[str, os.PathLike],
device: infinicore.device,
dtype=infinicore.dtype,
device: Optional[infinicore.device] = None,
dtype: Optional[infinicore.dtype] = None,
):
"""
Load a pretrained LlamaForCausalLM model from a directory.
Args:
model_path: Path to the model directory containing config.json
device: Device instance (defaults to CPU)
dtype: Optional dtype for model parameters (defaults to None)
Returns:
LlamaForCausalLM instance
"""
raise NotImplementedError("NotImplementedError!!")
config_path = os.path.join(model_path, "config.json")
if not os.path.exists(config_path):
raise FileNotFoundError(f"Config file not found: {config_path}")
with open(config_path, "r") as f:
config_dict = json.load(f)
config = LlamaConfig(config_dict)
return cls(config, device=device, dtype=dtype)
......@@ -49,7 +49,7 @@ def repeat_kv(keys: infinicore.Tensor, values: infinicore.Tensor, ngroup: int):
def multi_head_attention(
querys: infinicore.Tensor, # [seq_len, num_heads, head_dim]
keys: infinicore.Tensor, # [total_seq_len, num_heads, head_dim]
keys: infinicore.Tensor, # [total_seq_len, num_heads, head_dim]
values: infinicore.Tensor, # [total_seq_len, num_heads, head_dim]
scaling: float,
):
......@@ -81,9 +81,11 @@ def multi_head_attention(
def grouped_query_attention(
querys: infinicore.Tensor, # [seq_len, num_attention_heads, head_dim]
keys: infinicore.Tensor, # [total_seq_len, num_key_value_heads, head_dim]
values: infinicore.Tensor, # [total_seq_len, num_key_value_heads, head_dim]
# [seq_len, num_attention_heads, head_dim]
querys: infinicore.Tensor,
keys: infinicore.Tensor, # [total_seq_len, num_key_value_heads, head_dim]
# [total_seq_len, num_key_value_heads, head_dim]
values: infinicore.Tensor,
scaling: float,
):
num_attention_heads = querys.shape[1]
......@@ -175,7 +177,7 @@ class LlamaAttention(infinicore.nn.Module):
**kwargs,
) -> infinicore.Tensor:
hidden_states_shape = hidden_states.shape # [bs, seq_len, hidden_size]
bs, seq_len = hidden_states_shape[:-1] # [bs, seq_len]
bs, seq_len = hidden_states_shape[:-1] # [bs, seq_len]
querys_shape = (bs, seq_len, self.num_attention_heads, self.head_dim)
keys_shape = (bs, seq_len, self.num_key_value_heads, self.head_dim)
......
import subprocess
from pathlib import Path
from setuptools import setup
from setuptools.command.build import build
from setuptools.command.develop import develop
from setuptools.command.egg_info import egg_info
def build_cpp_module():
    """Compile the C++ extension with xmake, then install it into the tree."""
    for action in ("build", "install"):
        subprocess.run(["xmake", action, "_infinilm_llama"], check=True)
class Build(build):
    """`build` command that compiles the native extension first."""

    def run(self):
        build_cpp_module()
        build.run(self)
class Develop(develop):
    """`develop` (editable install) command that compiles the extension first."""

    def run(self):
        build_cpp_module()
        develop.run(self)
class EggInfo(egg_info):
    """`egg_info` command that builds the C++ module before metadata creation."""

    def run(self):
        # Ensure C++ module is built before creating egg-info
        build_cpp_module()
        egg_info.run(self)
setup(
name="InfiniLM",
version="0.1.0",
description="InfiniLM model implementations",
package_dir={"": "python"},
packages=["infinilm", "infinilm.models", "infinilm.lib"],
cmdclass={
"build": Build,
"develop": Develop,
"egg_info": EggInfo,
},
python_requires=">=3.10",
)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Subproject commit 88a0e07ad5bb3e2651cd5613530b3f06a15fc400
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment