Commit 3ddffe8d authored by pengcheng888

issue/76 - Add a Python implementation of the Llama model

parent 4fd9d490
@@ -35,3 +35,19 @@ python scripts/test_perf.py
```bash
python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS]
```
## Usage (new)
- Build and install `InfiniCore`; for details, see the InfiniCore [`README`](https://github.com/InfiniTensor/InfiniCore):
  - Make sure to set the `INFINI_ROOT` environment variable as prompted (defaults to `$HOME/.infini`)
  - Pick the xmake build configuration for your hardware platform
  - Build and install InfiniCore
    - Install the C++ libraries
    - Install the Python package
- Single-shot inference test
  - llama example (a concrete invocation is shown below the command template)
```bash
python examples/llama.py [--cpu | --nvidia] --model_path=<path/to/model_dir>
```
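For example, running on an NVIDIA GPU (the model directory below is a placeholder; `--max_new_tokens` is optional and defaults to 100):

```bash
python examples/llama.py --nvidia --model_path=/path/to/llama_model_dir --max_new_tokens=128
```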
import sys
import time
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))
import argparse
import infinilm
from infinilm.modeling_utils import get_model_state_dict
from tokenizers import decoders as _dec
from transformers import AutoTokenizer
import infinicore
def get_args():
parser = argparse.ArgumentParser(description="run Llama args")
parser.add_argument(
"--cpu",
action="store_true",
help="Run cpu test",
)
parser.add_argument(
"--nvidia",
action="store_true",
help="Run nvidia test",
)
parser.add_argument(
"--metax",
action="store_true",
help="Run metax test",
)
parser.add_argument(
"--model_path",
type=str,
required=True,
help="model_path",
)
parser.add_argument(
"--max_new_tokens",
type=int,
default=100,
help="max_new_tokens",
)
return parser.parse_args()
def test(model_path, device_str="cuda", max_new_tokens=100):
# ---------------------------------------------------------------------------- #
# Create the model
# ---------------------------------------------------------------------------- #
infini_device = infinicore.device(device_str, 0)
infini_dtype = infinicore.bfloat16
model = infinilm.LlamaForCausalLM.from_pretrained(
model_path,
device=infini_device,
dtype=infini_dtype,
)
# ---------------------------------------------------------------------------- #
# Load the model weights
# ---------------------------------------------------------------------------- #
model_param_infini = get_model_state_dict(
model_path,
device=infini_device,
dtype=infini_dtype,
)
model.load_state_dict(model_param_infini)
config = model.config
# ---------------------------------------------------------------------------- #
# Create the tokenizer
# ---------------------------------------------------------------------------- #
tokenizer = AutoTokenizer.from_pretrained(model_path)
if "llama" == config.model_type:
backend = getattr(tokenizer, "backend_tokenizer", None)
target = getattr(backend, "_tokenizer", backend)
norm = getattr(target, "normalizer", None)
dec = getattr(target, "decoder", None)
sn = repr(norm)[:800] if norm is not None else ""
sd = repr(dec)[:800] if dec is not None else ""
has_prepend = "Prepend" in sn
has_strip = "Strip" in sd
if has_prepend and has_strip:
target.decoder = _dec.Sequence(
[
_dec.Replace("▁", " "),
_dec.ByteFallback(),
_dec.Fuse(),
]
)
# ---------------------------------------------------------------------------- #
# Encode the prompt into token ids
# ---------------------------------------------------------------------------- #
prompt = "山东最高的山是?"
input_content = tokenizer.apply_chat_template(
conversation=[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
print(input_content, end="", flush=True)
input_ids = tokenizer.encode(input_content)
# ---------------------------------------------------------------------------- #
# Autoregressive generation
# ---------------------------------------------------------------------------- #
input_ids_list = [input_ids] # List: [[1, 1128, 526, 366, 29892]]
input_ids_infini = infinicore.from_list(input_ids_list)
t1 = time.time()
model.generate(
input_ids_infini,
max_new_tokens=max_new_tokens,
device=infini_device,
tokenizer=tokenizer,
config=config,
)
t2 = time.time()
print(
f"total_time: {round((t2 - t1) * 1000, 2)} ms",
)
if __name__ == "__main__":
args = get_args()
print(args)
# Parse command line arguments
device_type = "cpu"
if args.cpu:
device_type = "cpu"
elif args.nvidia:
device_type = "cuda"
elif args.metax:
device_type = "cuda"
else:
print(
"Usage: python examples/llama.py [--cpu | --nvidia] --model_path=<path/to/model_dir>"
)
sys.exit(1)
model_path = args.model_path
max_new_tokens = args.max_new_tokens
test(model_path, device_type, max_new_tokens)
from .models import *
# Copyright (c) 2025, InfiniCore
#
# This file contains modified code derived from transformers
# implementation, which is licensed under the BSD 3-Clause License.
#
# The modifications include adaptations for the InfiniCore framework.
#
# Original transformers source:
# https://github.com/huggingface/transformers
#
# Referencing transformers v4.57.0
#
# The use of this file is governed by the BSD 3-Clause License.
from abc import ABC, abstractmethod
from typing import Any, Optional
import transformers.utils.logging as logging
import infinicore
logger = logging.get_logger(__name__)
class CacheLayerMixin(ABC):
"""Base, abstract class for a single layer's cache."""
def __init__(self):
self.keys, self.values = None, None
def __repr__(self):
return f"{self.__class__.__name__}"
@abstractmethod
def lazy_initialization(self, key_states: infinicore.Tensor): ...
@abstractmethod
def update(
self,
key_states: infinicore.Tensor,
value_states: infinicore.Tensor,
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[infinicore.Tensor, infinicore.Tensor]: ...
class DynamicLayer(CacheLayerMixin):
"""
A cache layer that grows dynamically as more tokens are generated.
It stores the key and value states as tensors of shape `[batch_size, seq_len, num_heads, head_dim]`.
"""
def __init__(self, max_position_embeddings):
super().__init__()
self.max_position_embeddings = max_position_embeddings
self.cache_position = 0
def lazy_initialization(self, key_states: infinicore.Tensor):
batch_size, seq_len, num_heads, head_dim = key_states.shape
if self.keys is None:
dtype, device = key_states.dtype, key_states.device
self.cache_position = 0
self.max_seq_len = max(self.max_position_embeddings, seq_len)
self.keys = infinicore.empty(
[batch_size, self.max_seq_len, num_heads, head_dim],
dtype=dtype,
device=device,
)
self.values = infinicore.empty(
[batch_size, self.max_seq_len, num_heads, head_dim],
dtype=dtype,
device=device,
)
elif self.cache_position + seq_len >= self.max_seq_len:
dtype, device = key_states.dtype, key_states.device
self.max_seq_len = max(self.max_seq_len * 2, self.cache_position + seq_len)
keys_new = infinicore.empty(
[batch_size, self.max_seq_len, num_heads, head_dim],
dtype=dtype,
device=device,
)
values_new = infinicore.empty(
[batch_size, self.max_seq_len, num_heads, head_dim],
dtype=dtype,
device=device,
)
keys_new.narrow(1, 0, self.cache_position).copy_(
self.keys.narrow(1, 0, self.cache_position)
)
values_new.narrow(1, 0, self.cache_position).copy_(
self.values.narrow(1, 0, self.cache_position)
)
self.keys, self.values = keys_new, values_new
def update(
self,
key_states: infinicore.Tensor,
value_states: infinicore.Tensor,
cache_kwargs: Optional[dict[str, Any]] = None,
):
# Lazy initialization
self.lazy_initialization(key_states)
seq_len = key_states.shape[1]
index = self.cache_position
# Update the cache
self.keys.narrow(1, index, seq_len).copy_(key_states)
self.values.narrow(1, index, seq_len).copy_(value_states)
self.cache_position += seq_len
return self.keys.narrow(1, 0, self.cache_position), self.values.narrow(
1, 0, self.cache_position
)
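# A minimal usage sketch for DynamicLayer (the shapes, dtype and device below are
# illustrative assumptions, not values taken from a real model):
#
#   layer = DynamicLayer(max_position_embeddings=2048)
#   k = infinicore.empty([1, 5, 8, 64], dtype=infinicore.bfloat16, device=infinicore.device("cpu", 0))
#   v = infinicore.empty([1, 5, 8, 64], dtype=infinicore.bfloat16, device=infinicore.device("cpu", 0))
#   keys, values = layer.update(k, v)  # prefill: the cache now covers positions 0..4
#   keys, values = layer.update(k, v)  # a further call appends after position 4 (decode step)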
class Cache:
"""
A `Cache` is mostly a list of `CacheLayerMixin` objects, one per model layer. It serves as a container for the Cache of each layer.
Args:
layers (`list[CacheLayerMixin]`, *optional*): A list of pre-created `CacheLayerMixin` objects.
"""
def __init__(
self,
layers: Optional[list[CacheLayerMixin]] = None,
):
self.layers = layers if layers is not None else []
def update(
self,
key_states: infinicore.Tensor,
value_states: infinicore.Tensor,
layer_idx: int,
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[infinicore.Tensor, infinicore.Tensor]:
"""
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
Parameters:
key_states (`infinicore.Tensor`):
The new key states to cache.
value_states (`infinicore.Tensor`):
The new value states to cache.
layer_idx (`int`):
The index of the layer to cache the states for.
cache_kwargs (`dict[str, Any]`, *optional*):
Additional arguments for the cache subclass.
Return:
A tuple containing the updated key and value states.
"""
keys, values = self.layers[layer_idx].update(
key_states, value_states, cache_kwargs
)
return keys.contiguous(), values.contiguous()
class DynamicCache(Cache):
"""
A cache that grows dynamically as more tokens are generated. This is the default for generative models.
It stores the key and value states as a list of `CacheLayer`, one for each layer.
Args:
config (`PretrainedConfig`, *optional*):
The config of the model for which this Cache will be used.
"""
def __init__(
self,
config=None,
):
layers = []
# If a config is passed, use it to infer the layer types and initialize accordingly
if config is not None:
config = config.get_text_config()
max_position_embeddings = config.max_position_embeddings
layer_types = ["full_attention" for _ in range(config.num_hidden_layers)]
for _ in layer_types:
layers.append(DynamicLayer(max_position_embeddings))
super().__init__(
layers=layers,
)
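# Usage sketch: in this commit a DynamicCache is created once per generation call
# (see GenerationMixin.generate, which passes config=self.config), and every
# attention layer then calls past_key_values.update(key_states, value_states,
# layer_idx) to append its new KV states and read back the full cache for that layer.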
# Copyright (c) 2025, InfiniCore
#
# This file contains modified code derived from transformers
# implementation, which is licensed under the BSD 3-Clause License.
#
# The modifications include adaptations for the InfiniCore framework.
#
# Original transformers source:
# https://github.com/huggingface/transformers
#
# Referencing transformers v4.57.0
#
# The use of this file is governed by the BSD 3-Clause License.
import copy
from typing import Any
class PretrainedConfig:
def __init__(self, *args, **kwargs):
pass
def to_dict(self) -> dict[str, Any]:
"""
Serializes this instance to a Python dictionary.
Returns:
`dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
if hasattr(self.__class__, "model_type"):
output["model_type"] = self.__class__.model_type
# Transformers version when serializing the model
output["transformers_version"] = "None"
for key, value in output.items():
# Deal with nested configs like CLIP
if isinstance(value, PretrainedConfig):
value = value.to_dict()
del value["transformers_version"]
output[key] = value
self.dict_dtype_to_str(output)
return output
@property
def is_encoder_decoder(self):
return False
def dict_dtype_to_str(self, d: dict[str, Any]) -> None:
"""
Checks whether the passed dictionary and its nested dicts have a *dtype* key and if it's not None,
converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
string, which can then be stored in the json format.
"""
if d.get("dtype") is not None and not isinstance(d["dtype"], str):
d["dtype"] = str(d["dtype"]).split(".")[1]
for value in d.values():
if isinstance(value, dict):
self.dict_dtype_to_str(value)
def get_text_config(self, decoder=None, encoder=None):
return_both = (
decoder == encoder
) # both unset or both set -> search all possible names
decoder_possible_text_config_names = ("decoder", "generator", "text_config")
encoder_possible_text_config_names = ("text_encoder",)
if return_both:
possible_text_config_names = (
encoder_possible_text_config_names + decoder_possible_text_config_names
)
elif decoder:
possible_text_config_names = decoder_possible_text_config_names
else:
possible_text_config_names = encoder_possible_text_config_names
valid_text_config_names = []
for text_config_name in possible_text_config_names:
if hasattr(self, text_config_name):
text_config = getattr(self, text_config_name, None)
if text_config is not None:
valid_text_config_names += [text_config_name]
if len(valid_text_config_names) > 1:
raise ValueError(
f"Multiple valid text configs were found in the model config: {valid_text_config_names}. In this "
"case, using `get_text_config()` would be ambiguous. Please specify the desired text config directly, "
"e.g. `text_config = config.sub_config_name`"
)
elif len(valid_text_config_names) == 1:
config_to_return = getattr(self, valid_text_config_names[0])
else:
config_to_return = self
# handle legacy models with flat config structure, when we only want one of the configs
if (
not return_both
and len(valid_text_config_names) == 0
and config_to_return.is_encoder_decoder
):
config_to_return = copy.deepcopy(config_to_return)
prefix_to_discard = "encoder" if decoder else "decoder"
for key in config_to_return.to_dict():
if key.startswith(prefix_to_discard):
delattr(config_to_return, key)
# old encoder/decoder models may use "encoder_layers"/"decoder_layers" instead of "num_hidden_layers"
if decoder and hasattr(config_to_return, "decoder_layers"):
config_to_return.num_hidden_layers = config_to_return.decoder_layers
elif encoder and hasattr(config_to_return, "encoder_layers"):
config_to_return.num_hidden_layers = config_to_return.encoder_layers
return config_to_return
import time
from typing import Optional
import infinicore
from ..cache_utils import Cache, DynamicCache
import numpy as np
def infini_to_ctype_dtype(infini_dtype):
"""Convert PyTorch data type to infinicore data type"""
import ctypes
if infini_dtype == infinicore.int32:
return ctypes.c_int32
elif infini_dtype == infinicore.float32:
return ctypes.c_float
else:
raise ValueError(f"Unsupported py_dtype: {infini_dtype}")
def infini_to_numpy(infini_tensor: infinicore.Tensor):
if infini_tensor.device.type != "cpu":
infini_tensor_cpu = infini_tensor.to(infinicore.device("cpu", 0))
else:
infini_tensor_cpu = infini_tensor
# Get the data pointer and shape information
data_ptr = infini_tensor_cpu.data_ptr()
num_elements = infini_tensor_cpu.numel()
original_shape = infini_tensor_cpu.shape
# Create a flat NumPy array that shares the tensor's memory
ArrayType = infini_to_ctype_dtype(infini_tensor_cpu.dtype) * num_elements
array = ArrayType.from_address(data_ptr)
np_flat = np.ctypeslib.as_array(array)
# Reshape to the original shape
np_array = np_flat.reshape(original_shape)
return np.copy(np_array)
infinicore.Tensor.to_numpy = infini_to_numpy
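# Usage sketch: with the monkey-patch above, any infinicore tensor can be copied
# out to NumPy, e.g. the sampled token ids in _sample below:
#
#   token_id = next_tokens.to_numpy()[0]
#
# Note that infini_to_ctype_dtype only handles int32 and float32, so to_numpy is
# limited to those dtypes.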
class GenerationMixin:
def _get_initial_cache_position(
self,
bs: int,
seq_length: int,
device: infinicore.device,
) -> infinicore.Tensor:
"""Calculates `cache_position` for the pre-fill stage"""
cache_position_list = [list(range(0, seq_length)) for i in range(bs)]
return infinicore.from_list(
cache_position_list, dtype=infinicore.int64, device=device
)
def prepare_inputs_for_generation(
self,
device: infinicore.device,
past_key_values: Optional[Cache] = None,
**kwargs,
):
"""Prepare the model inputs for generation."""
model_inputs = {}
# -------------------------------------------------------------------- #
# Required: the KV cache
# -------------------------------------------------------------------- #
if past_key_values is not None:
model_inputs["past_key_values"] = past_key_values
# -------------------------------------------------------------------------- #
# Compute the cache_position needed for this step
# -------------------------------------------------------------------------- #
current_cache_position = kwargs.get("cache_position", None)
if current_cache_position is None:
# Prefill stage
bs, seq_len = kwargs["input_ids"].shape[0:2]
model_inputs["cache_position"] = self._get_initial_cache_position(
bs, seq_len, device
)
else:
# Decode stage
bs, seq_len = current_cache_position.shape
last_position = current_cache_position.narrow(1, seq_len - 1, 1)
one_value = infinicore.from_list(
[1],
dtype=last_position.dtype,
device=last_position.device,
).view((bs, 1))
next_position = one_value + last_position
model_inputs["cache_position"] = next_position
# -------------------------------------------------------------------- #
# Required: input_ids of the next token
# -------------------------------------------------------------------- #
if kwargs.get("next_token_id", None) is not None:
next_token_id = kwargs["next_token_id"]
model_inputs["input_ids"] = infinicore.from_list([[next_token_id]])
# -------------------------------------------------------------------- #
# Other kwargs
# -------------------------------------------------------------------- #
for key, value in kwargs.items():
if key not in model_inputs:
model_inputs[key] = value
return model_inputs
def generate(
self,
input_ids: infinicore.Tensor,
max_new_tokens: int,
device: infinicore.device,
tokenizer,
config,
**kwargs,
):
model_kwargs = kwargs
# -------------------------------------------------------------------- #
# Create the KV cache #
# -------------------------------------------------------------------- #
model_kwargs["use_cache"] = True
model_kwargs["past_key_values"] = DynamicCache(config=self.config)
# -------------------------------------------------------------------- #
# Run _sample #
# -------------------------------------------------------------------- #
result = self._sample(
input_ids,
max_new_tokens=max_new_tokens,
device=device,
tokenizer=tokenizer,
config=config,
**model_kwargs,
)
return result
def _sample(
self,
input_ids: infinicore.Tensor,
max_new_tokens: int,
device: infinicore.device,
tokenizer,
config,
**model_kwargs,
):
r"""
Generates sequences of token ids for models with a language modeling head.
Parameters:
input_ids (batch_size, seq_len): The sequence used as a prompt for the generation.
max_new_tokens: Maximum number of new tokens.
device: infinicore.device.
tokenizer: used to decode generated token ids into text.
"""
batch_size, seq_len = input_ids.shape[:2]
eos_token_id = config.eos_token_id
eos_token_id_list = (
[eos_token_id] if isinstance(eos_token_id, int) else eos_token_id
)
# -------------------------------------------------------------------------- #
# Initialize cache_position
# -------------------------------------------------------------------------- #
output_tokens_list = []
model_kwargs["input_ids"] = input_ids
model_kwargs["cache_position"] = None
output_content = ""
print()
time_list = []
for i in range(0, max_new_tokens):
# -------------------------------------------------------------------------- #
# prepare model inputs
# -------------------------------------------------------------------------- #
model_inputs = self.prepare_inputs_for_generation(device, **model_kwargs)
model_kwargs["cache_position"] = model_inputs["cache_position"]
# -------------------------------------------------------------------------- #
# Run one forward pass
# -------------------------------------------------------------------------- #
start_time = time.time()
logits = self.forward(**model_inputs, return_dict=True)
# -------------------------------------------------------------------------- #
# Process the output
# -------------------------------------------------------------------------- #
token_scores = logits
# -------------------------------------------------------------------------- #
# random_sample
# -------------------------------------------------------------------------- #
batch_size, _, vocab_size = token_scores.shape
next_tokens = infinicore.empty(
(batch_size,),
dtype=infinicore.int32,
device=token_scores.device,
)
for b in range(batch_size):  # separate index so the generation-step counter `i` is not shadowed
score = token_scores.narrow(0, b, 1).view([vocab_size])
out = next_tokens.narrow(0, b, 1).view([])
infinicore.nn.functional.random_sample(
score,
0.8,
0.1,
1,
1.0,
out=out,
)
end_time = time.time()
time_list.append((end_time - start_time) * 1000)
# ----------------------------------------------------------------- #
# Get the next token id and decode it to text
# ----------------------------------------------------------------- #
token_id = next_tokens.to_numpy()[0]
output_str = tokenizer.decode([token_id], skip_special_tokens=True)
model_kwargs["next_token_id"] = token_id
output_tokens_list.append(token_id)
output_content += output_str
print(output_str, end="", flush=True)
if token_id in eos_token_id_list:
break
print(
f"\n\n Time per step: {round(sum(time_list) / len(time_list), 2)} ms\n",
)
return output_tokens_list, output_content
import os
from typing import Dict, Optional, Union
import torch
from safetensors import safe_open
# from safetensors.torch import load_file as safe_load_file
# from safetensors.torch import save_file as safe_save_file
import infinicore
str_to_torch_dtype = {
"BOOL": torch.bool,
"U8": torch.uint8,
"I8": torch.int8,
"I16": torch.int16,
"F16": torch.float16,
"BF16": torch.bfloat16,
"I32": torch.int32,
"F32": torch.float32,
"F64": torch.float64,
"I64": torch.int64,
"F8_E4M3": torch.float8_e4m3fn,
"F8_E5M2": torch.float8_e5m2,
}
def load_state_dict(
checkpoint_file: Union[str, os.PathLike],
map_location: Optional[Union[str, torch.device]] = "cpu",
weights_only: bool = True,
) -> Dict[str, torch.Tensor]:
"""
Reads a `safetensor` checkpoint file. We load the checkpoint on "cpu" by default.
"""
# Use safetensors if possible
if not checkpoint_file.endswith(".safetensors"):
return {}
state_dict = {}
with safe_open(checkpoint_file, framework="pt") as f:
metadata = f.metadata()
if metadata is not None and metadata.get("format") not in [
"pt",
"tf",
"flax",
"mlx",
]:
raise OSError(
f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata."
)
for k in f.keys():
if map_location == "meta":
_slice = f.get_slice(k)
k_dtype = _slice.get_dtype()
if k_dtype in str_to_torch_dtype:
dtype = str_to_torch_dtype[k_dtype]
else:
raise ValueError(
f"Cannot load safetensors of unknown dtype {k_dtype}"
)
state_dict[k] = torch.empty(
size=_slice.get_shape(), dtype=dtype, device="meta"
)
else:
state_dict[k] = f.get_tensor(k)
return state_dict
def get_model_state_dict(
model_path: str,
device: infinicore.device,
dtype=infinicore.dtype,
) -> Dict[str, infinicore.Tensor]:
"""
Load the model weights.
"""
path = os.path.join(model_path, "model.safetensors")
model_param = load_state_dict(path)
torch_device = device.type
torch_dtype = infinicore.utils.to_torch_dtype(dtype)
model_param_infini = {}
for key, value in model_param.items():
model_param[key] = value.to(device=torch_device, dtype=torch_dtype)
for key, value in model_param.items():
model_param_infini[key] = infinicore.from_torch(model_param[key])
return model_param_infini
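# Usage sketch (mirroring examples/llama.py): load the weights of a model directory
# that contains a single model.safetensors file and hand them to the model:
#
#   state_dict = get_model_state_dict(model_path, device=infinicore.device("cuda", 0), dtype=infinicore.bfloat16)
#   model.load_state_dict(state_dict)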
from .configuration_llama import * # noqa: F403
from .modeling_llama import * # noqa: F403
# coding=utf-8
# Copyright (c) 2025, InfiniCore
#
# This file contains modified code derived from transformers
# implementation, which is licensed under the BSD 3-Clause License.
#
# The modifications include adaptations for the InfiniCore framework.
#
# Original transformers source:
# https://github.com/huggingface/transformers
#
# Referencing transformers v4.57.0
#
# The use of this file is governed by the BSD 3-Clause License.
"""LLaMA model configuration"""
from ...configuration_utils import PretrainedConfig
class LlamaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the LLaMA-7B.
e.g. [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`LlamaModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details, check out [this
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
Llama 2 up to 4096, CodeLlama up to 16384.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'llama3'], with 'default' being the original RoPE implementation.
`factor` (`float`, *optional*):
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
original maximum pre-trained length.
`original_max_position_embeddings` (`int`, *optional*):
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
pretraining.
`attention_factor` (`float`, *optional*):
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
computation. If unspecified, it defaults to value recommended by the implementation, using the
`factor` field to infer the suggested value.
`beta_fast` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
ramp function. If unspecified, it defaults to 32.
`beta_slow` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
ramp function. If unspecified, it defaults to 1.
`short_factor` (`list[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`long_factor` (`list[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to long contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`low_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
`high_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
head_dim (`int`, *optional*):
The attention head dimension. If None, it will default to hidden_size // num_attention_heads
```python
>>> from transformers import LlamaModel, LlamaConfig
>>> # Initializing a LLaMA llama-7b style configuration
>>> configuration = LlamaConfig()
>>> # Initializing a model from the llama-7b style configuration
>>> model = LlamaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "llama"
keys_to_ignore_at_inference = ["past_key_values"]
# Default tensor parallel plan for base model `LlamaModel`
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
base_model_pp_plan = {
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
"norm": (["hidden_states"], ["hidden_states"]),
}
def __init__(
self,
vocab_size=32000,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
mlp_bias=False,
head_dim=None,
**kwargs,
):
# ---
self.model_type = "llama"
self.name_or_path = ""
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
# ---
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias
self.head_dim = (
head_dim
if head_dim is not None
else self.hidden_size // self.num_attention_heads
)
# Validate the correctness of rotary position embeddings parameters
# BC: if there is a 'type' field, copy it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
# rope_config_validation(self)
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
__all__ = ["LlamaConfig"]
# Copyright (c) 2025, InfiniCore
#
# This file contains modified code derived from transformers
# implementation, which is licensed under the BSD 3-Clause License.
#
# The modifications include adaptations for the InfiniCore framework.
#
# Original transformers source:
# https://github.com/huggingface/transformers
#
# Referencing transformers v4.57.0
#
# The use of this file is governed by the BSD 3-Clause License.
import json
import os
from typing import Optional, Union
from transformers.utils import logging
import infinicore
from ...cache_utils import Cache, DynamicCache
from ...generation.utils import GenerationMixin
from .configuration_llama import LlamaConfig
logger = logging.get_logger(__name__)
def repeat_kv(keys: infinicore.Tensor, values: infinicore.Tensor, ngroup: int):
total_seq_len, num_key_value_heads, head_dim = keys.shape
keys_repeat = infinicore.empty(
(total_seq_len, num_key_value_heads, ngroup, head_dim),
dtype=keys.dtype,
device=keys.device,
)
values_repeat = infinicore.empty(
(total_seq_len, num_key_value_heads, ngroup, head_dim),
dtype=values.dtype,
device=values.device,
)
for i in range(ngroup):
keys_repeat.narrow(2, i, 1).copy_(
keys.view((total_seq_len, num_key_value_heads, 1, head_dim))
)
values_repeat.narrow(2, i, 1).copy_(
values.view((total_seq_len, num_key_value_heads, 1, head_dim))
)
keys_new = keys_repeat.view((total_seq_len, num_key_value_heads * ngroup, head_dim))
values_new = values_repeat.view(
(total_seq_len, num_key_value_heads * ngroup, head_dim)
)
return keys_new, values_new
def multi_head_attention(
querys: infinicore.Tensor, # [seq_len, num_heads, head_dim]
keys: infinicore.Tensor, # [total_seq_len, num_heads, head_dim]
values: infinicore.Tensor, # [total_seq_len, num_heads, head_dim]
scaling: float,
):
# => [ num_heads, seq_len, head_dim]
Q = querys.permute((1, 0, 2))
# kept as [total_seq_len, num_heads, head_dim]; permuted to [num_heads, head_dim, total_seq_len] below
K = keys
# => [ num_heads, total_seq_len, head_dim]
V = values.permute((1, 0, 2))
# [num_heads, seq_len, head_dim] @ [ num_heads, head_dim, total_seq_len]
# => [ num_heads, seq_len, total_seq_len]
attn_weight = Q @ K.permute((1, 2, 0))
scaling = infinicore.from_list(
[scaling], dtype=attn_weight.dtype, device=attn_weight.device
).as_strided(attn_weight.shape, [0, 0, 0])
attn_weight = attn_weight * scaling
infinicore.nn.functional.causal_softmax(attn_weight, out=attn_weight)
# [ num_heads, seq_len, total_seq_len] @ [num_heads, total_seq_len, head_dim]
# => [ num_heads,seq_len,head_dim]
out = attn_weight @ V
# => [seq_len, num_heads, head_dim]
return out.permute((1, 0, 2)).contiguous()
def grouped_query_attention(
querys: infinicore.Tensor, # [seq_len, num_attention_heads, head_dim]
keys: infinicore.Tensor, # [total_seq_len, num_key_value_heads, head_dim]
values: infinicore.Tensor, # [total_seq_len, num_key_value_heads, head_dim]
scaling: float,
):
num_attention_heads = querys.shape[1]
num_key_value_heads = keys.shape[1]
ngroup = num_attention_heads // num_key_value_heads
if ngroup > 1:
keys, values = repeat_kv(keys, values, ngroup)
return multi_head_attention(querys, keys, values, scaling=scaling)
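# Shape sketch (illustrative numbers): with num_attention_heads=32 and
# num_key_value_heads=8, ngroup is 4; querys of shape [seq_len, 32, head_dim] and
# keys/values of shape [total_seq_len, 8, head_dim] are expanded by repeat_kv to
# 32 KV heads, and the returned attention output has the same shape as querys.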
LlamaRMSNorm = infinicore.nn.RMSNorm
class LlamaMLP(infinicore.nn.Module):
def __init__(self, config, **kwargs):
super().__init__()
hidden_size = config.hidden_size
intermediate_size = config.intermediate_size
mlp_bias = config.mlp_bias
self.gate_proj = infinicore.nn.Linear(
hidden_size, intermediate_size, bias=mlp_bias, **kwargs
)
self.up_proj = infinicore.nn.Linear(
hidden_size, intermediate_size, bias=mlp_bias, **kwargs
)
self.down_proj = infinicore.nn.Linear(
intermediate_size, hidden_size, bias=mlp_bias, **kwargs
)
self.act_fn = infinicore.nn.functional.silu
def forward(self, x: infinicore.Tensor) -> infinicore.Tensor:
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
return down_proj
class LlamaAttention(infinicore.nn.Module):
def __init__(self, config: LlamaConfig, layer_idx: int, **kwargs):
super().__init__()
self.config = config
self.layer_idx = layer_idx
self.hidden_size = config.hidden_size
self.num_attention_heads = config.num_attention_heads
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_attention_heads // self.num_key_value_heads
attention_bias = config.attention_bias
self.head_dim = getattr(
config, "head_dim", self.hidden_size // self.num_attention_heads
)
self.scaling = self.head_dim**-0.5
self.q_proj = infinicore.nn.Linear(
self.hidden_size,
self.num_attention_heads * self.head_dim,
bias=attention_bias,
**kwargs,
)
self.k_proj = infinicore.nn.Linear(
self.hidden_size,
self.num_key_value_heads * self.head_dim,
bias=attention_bias,
**kwargs,
)
self.v_proj = infinicore.nn.Linear(
self.hidden_size,
self.num_key_value_heads * self.head_dim,
bias=attention_bias,
**kwargs,
)
self.o_proj = infinicore.nn.Linear(
self.num_attention_heads * self.head_dim,
self.hidden_size,
bias=attention_bias,
**kwargs,
)
def forward(
self,
hidden_states: infinicore.Tensor,
past_key_values: Optional[Cache] = None,
rope_instance: infinicore.nn.RoPE = None,
**kwargs,
) -> infinicore.Tensor:
hidden_states_shape = hidden_states.shape # [bs, seq_len, hidden_size]
bs, seq_len = hidden_states_shape[:-1] # [bs, seq_len]
querys_shape = (bs, seq_len, self.num_attention_heads, self.head_dim)
keys_shape = (bs, seq_len, self.num_key_value_heads, self.head_dim)
values_shape = (bs, seq_len, self.num_key_value_heads, self.head_dim)
# --------------------------------------------------------------------------------------- #
# Project Q, K, V
# --------------------------------------------------------------------------------------- #
# => [bs, seq_len, num_attention_heads, head_dim]
query_states = self.q_proj(hidden_states).view(querys_shape)
# => [bs, seq_len, num_key_value_heads, head_dim]
key_states = self.k_proj(hidden_states).view(keys_shape)
# => [bs, seq_len, nkvh, head_dim]
value_states = self.v_proj(hidden_states).view(values_shape)
# --------------------------------------------------------------------------------------- #
# Apply RoPE to Q and K
# --------------------------------------------------------------------------------------- #
cache_position = kwargs.pop("cache_position", None)
if cache_position is None:
raise KeyError("cache_position error")
if rope_instance is None:
raise KeyError("rope_instance error")
query_states = rope_instance(query_states, cache_position)
key_states = rope_instance(key_states, cache_position)
# --------------------------------------------------------------------------------------- #
# kv cache
# --------------------------------------------------------------------------------------- #
if past_key_values is not None:
cache_kwargs = {}
key_states_total, value_states_total = past_key_values.update(
key_states, # [bs, seq_len, num_key_value_heads, head_dim]
value_states, # [bs, seq_len, num_key_value_heads, head_dim]
self.layer_idx,
cache_kwargs,
)
else:
# No cache: attend only over the current key/value states
key_states_total, value_states_total = key_states, value_states
# --------------------------------------------------------------------------------------- #
# Attention computation
# --------------------------------------------------------------------------------------- #
total_seq_len = key_states_total.shape[1]
attn_output = infinicore.empty_like(query_states)
for i in range(0, bs):
query_states_i = query_states.narrow(0, i, 1).view(
(seq_len, self.num_attention_heads, self.head_dim)
)
key_states_i = key_states_total.narrow(0, i, 1).view(
(total_seq_len, self.num_key_value_heads, self.head_dim)
)
value_states_i = value_states_total.narrow(0, i, 1).view(
(total_seq_len, self.num_key_value_heads, self.head_dim)
)
attn_output_i = attn_output.narrow(0, i, 1).view(
(seq_len, self.num_attention_heads, self.head_dim)
)
attention_i = grouped_query_attention(
query_states_i, key_states_i, value_states_i, scaling=self.scaling
)
attn_output_i.copy_(attention_i)
# --------------------------------------------------------------------------------------- #
# Output projection
# --------------------------------------------------------------------------------------- #
# ([bs, seq_len, num_attention_heads, head_dim]) ==> [bs, seq_len, hidden_size ]
attn_output = attn_output.view(hidden_states_shape)
# o_proj
return self.o_proj(attn_output)
class LlamaDecoderLayer(infinicore.nn.Module):
def __init__(self, config: LlamaConfig, layer_idx: int, **kwargs):
super().__init__()
hidden_size = config.hidden_size
rms_norm_eps = config.rms_norm_eps
self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx, **kwargs)
self.mlp = LlamaMLP(config=config, **kwargs)
self.input_layernorm = LlamaRMSNorm(hidden_size, eps=rms_norm_eps, **kwargs)
self.post_attention_layernorm = LlamaRMSNorm(
hidden_size, eps=rms_norm_eps, **kwargs
)
def forward(
self,
hidden_states: infinicore.Tensor, # [bs, seq_len, hidden_size]
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = False,
rope_instance=None,
**kwargs,
) -> infinicore.Tensor:
# ------------------------------------------------ #
# Self Attention
# ------------------------------------------------ #
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
hidden_states = self.self_attn(
hidden_states=hidden_states,
past_key_values=past_key_values,
use_cache=use_cache,
rope_instance=rope_instance,
**kwargs,
)
hidden_states = residual + hidden_states
# ------------------------------------------------ #
# Fully Connected
# ------------------------------------------------ #
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
return hidden_states
class LlamaModel(infinicore.nn.Module):
def __init__(self, config: LlamaConfig, **kwargs):
super().__init__()
self.config = config
self.padding_idx = config.pad_token_id
head_dim = getattr(
config, "head_dim", config.hidden_size // config.num_attention_heads
)
self.embed_tokens = infinicore.nn.Embedding(
config.vocab_size, config.hidden_size, **kwargs
)
self.layers = infinicore.nn.ModuleList(
[
LlamaDecoderLayer(config, layer_idx, **kwargs)
for layer_idx in range(config.num_hidden_layers)
]
)
self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps, **kwargs)
self.rope_instance = infinicore.nn.RoPE(
max_position_embeddings=config.max_position_embeddings,
rope_theta=config.rope_theta,
head_dim=head_dim,
**kwargs,
)
def forward(
self,
input_ids,
cache_position,
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = None, # True
**kwargs,
):
if use_cache and past_key_values is None:
past_key_values = DynamicCache(config=self.config)
# --------------------------------------------------------- #
# Token embedding
# --------------------------------------------------------- #
# input_ids : {1,5} tensor([[ 1, 1128, 526, 366, 29892]])
# inputs_embeds : {1,5,2048} tensor([[[...]]])
inputs_embeds = self.embed_tokens(input_ids)
# --------------------------------------------------------- #
# decoder_layer
# --------------------------------------------------------- #
hidden_states = inputs_embeds
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
hidden_states = decoder_layer(
hidden_states,
past_key_values=past_key_values,
cache_position=cache_position,
rope_instance=self.rope_instance,
**kwargs,
)
# --------------------------------------------------------- #
# norm
# --------------------------------------------------------- #
seq_len = hidden_states.shape[1]
last_token = hidden_states.narrow(1, seq_len - 1, 1)
return self.norm(last_token)
class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
config: LlamaConfig
def __init__(self, config, **kwargs):
super().__init__()
self.config = config
self.model = LlamaModel(config, **kwargs)
self.lm_head = infinicore.nn.Linear(
config.hidden_size,
config.vocab_size,
bias=False,
**kwargs,
)
def forward(
self,
input_ids,
cache_position,
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = None,
**kwargs,
):
last_token = self.model(
input_ids,
cache_position,
past_key_values=past_key_values,
use_cache=use_cache,
**kwargs,
)
return self.lm_head(last_token)
@classmethod
def from_pretrained(
cls,
model_path: Optional[Union[str, os.PathLike]],
device: infinicore.device,
dtype=infinicore.dtype,
):
def load_config_json(dir_path_: str):
with open(os.path.join(dir_path_, "config.json"), "r") as f:
config = json.load(f)
return config
config_dict = load_config_json(os.path.join(model_path))
config = LlamaConfig(**config_dict)
return LlamaForCausalLM(config, device=device, dtype=dtype)
__all__ = [
"LlamaModel",
"LlamaForCausalLM",
]