Commit e4114c03 authored by pengcheng888

issue/83 - Add the AutoLlama class to support creating models with different backends

parent 5d182420
......@@ -49,5 +49,9 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
- Single inference test
- llama example
```bash
python examples/llama.py [--cpu | --nvidia] --model_path=<path/to/model_dir>
python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>
```
For example:
```bash
python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0
```
\ No newline at end of file
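This commit also adds a `--backend` option to `examples/llama.py` (default `python`, with `cpp` as the alternative) for selecting the model backend on the command line; a usage sketch, assuming the flag is combined with the device switches shown above:
```bash
python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0 --backend=cpp
```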
......@@ -53,20 +53,28 @@ def get_args():
default=100,
help="max_new_tokens",
)
parser.add_argument(
"--backend",
type=str,
default="python",
help="python or cpp model",
)
return parser.parse_args()
def test(model_path, device_str="cuda", max_new_tokens=100):
def test(
prompt,
model_path,
max_new_tokens=100,
infini_dtype=infinicore.bfloat16,
infini_device=infinicore.device("cpu", 0),
backend="python",
):
# ---------------------------------------------------------------------------- #
# Create the model
# ---------------------------------------------------------------------------- #
infini_device = infinicore.device(device_str, 0)
infini_dtype = infinicore.bfloat16
model = infinilm.LlamaForCausalLM.from_pretrained(
model_path,
device=infini_device,
dtype=infini_dtype,
model = infinilm.AutoLlamaModel.from_pretrained(
model_path, device=infini_device, dtype=infini_dtype, backend=backend
)
# ---------------------------------------------------------------------------- #
......@@ -85,7 +93,6 @@ def test(model_path, device_str="cuda", max_new_tokens=100):
# ---------------------------------------------------------------------------- #
# Create the tokenizer
# ---------------------------------------------------------------------------- #
tokenizer = AutoTokenizer.from_pretrained(model_path)
if "llama" == config.model_type:
......@@ -109,7 +116,7 @@ def test(model_path, device_str="cuda", max_new_tokens=100):
# ---------------------------------------------------------------------------- #
# Token encoding
# ---------------------------------------------------------------------------- #
prompt = "山东最高的山是?"
# prompt = "山东最高的山是?"
input_content = tokenizer.apply_chat_template(
conversation=[{"role": "user", "content": prompt}],
add_generation_prompt=True,
......@@ -144,24 +151,37 @@ if __name__ == "__main__":
print(args)
# Parse command line arguments
device_type = "cpu"
device_str = "cpu"
if args.cpu:
device_type = "cpu"
device_str = "cpu"
elif args.nvidia:
device_type = "cuda"
device_str = "cuda"
elif args.metax:
device_type = "cuda"
device_str = "cuda"
elif args.moore:
device_type = "musa"
device_str = "musa"
elif args.iluvatar:
device_type = "cuda"
device_str = "cuda"
else:
print(
"Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>"
"Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>\n"
"such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
)
sys.exit(1)
prompt = "山东最高的山是?"
model_path = args.model_path
max_new_tokens = args.max_new_tokens
backend = args.backend
infini_device = infinicore.device(device_str, 0)
infini_dtype = infinicore.bfloat16
test(model_path, device_type, max_new_tokens)
test(
prompt,
model_path,
max_new_tokens,
infini_device=infini_device,
infini_dtype=infini_dtype,
backend=backend,
)
from .models import *
from .models import AutoLlamaModel
__all__ = ["AutoLlamaModel"]
......@@ -43,17 +43,17 @@ infinicore.Tensor.to_numpy = infini_to_numpy
class GenerationMixin:
def _get_initial_cache_position(
def _get_initial_position_ids(
self,
bs: int,
seq_length: int,
device: infinicore.device,
) -> infinicore.Tensor:
"""Calculates `cache_position` for the pre-fill stage"""
cache_position_list = [list(range(0, seq_length)) for i in range(bs)]
"""Calculates `position_ids` for the pre-fill stage"""
position_ids_list = [list(range(0, seq_length)) for i in range(bs)]
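# e.g. bs=2, seq_length=3 -> [[0, 1, 2], [0, 1, 2]], one row of positions per sequence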
return infinicore.from_list(
cache_position_list, dtype=infinicore.int64, device=device
position_ids_list, dtype=infinicore.int64, device=device
)
def prepare_inputs_for_generation(
......@@ -73,29 +73,29 @@ class GenerationMixin:
model_inputs["past_key_values"] = past_key_values
# -------------------------------------------------------------------------- #
# Compute the required cache_position
# Compute the required position_ids
# -------------------------------------------------------------------------- #
current_cache_position = kwargs.get("cache_position", None)
if current_cache_position is None:
current_position_ids = kwargs.get("position_ids", None)
if current_position_ids is None:
# prefill stage
bs, seq_len = kwargs["input_ids"].shape[0:2]
model_inputs["cache_position"] = self._get_initial_cache_position(
model_inputs["position_ids"] = self._get_initial_position_ids(
bs, seq_len, device
)
else:
# decode stage
bs, seq_len = current_cache_position.shape
last_position = current_cache_position.narrow(1, seq_len - 1, 1)
bs, seq_len = current_position_ids.shape
last_position = current_position_ids.narrow(1, seq_len - 1, 1)
one_value = infinicore.from_list(
[1],
[1] * bs,
dtype=last_position.dtype,
device=last_position.device,
).view((bs, 1))
next_position = one_value + last_position
model_inputs["cache_position"] = next_position
model_inputs["position_ids"] = next_position
# -------------------------------------------------------------------- #
# Required: the input_ids of the tokens
......@@ -127,8 +127,12 @@ class GenerationMixin:
# -------------------------------------------------------------------- #
# Create the cache #
# -------------------------------------------------------------------- #
model_kwargs["use_cache"] = True
model_kwargs["past_key_values"] = DynamicCache(config=self.config)
if self.use_cache:
model_kwargs["use_cache"] = True
model_kwargs["past_key_values"] = DynamicCache(config=self.config)
else:
model_kwargs["use_cache"] = False
model_kwargs["past_key_values"] = None
# -------------------------------------------------------------------- #
# The _sample function #
......@@ -170,12 +174,12 @@ class GenerationMixin:
)
# -------------------------------------------------------------------------- #
# Initialize cache_position
# Initialize position_ids
# -------------------------------------------------------------------------- #
output_tokens_list = []
model_kwargs["input_ids"] = input_ids
model_kwargs["cache_position"] = None
model_kwargs["position_ids"] = None
output_content = ""
print()
......@@ -186,13 +190,13 @@ class GenerationMixin:
# -------------------------------------------------------------------------- #
model_inputs = self.prepare_inputs_for_generation(device, **model_kwargs)
model_kwargs["cache_position"] = model_inputs["cache_position"]
model_kwargs["position_ids"] = model_inputs["position_ids"]
# -------------------------------------------------------------------------- #
# Run one forward step
# -------------------------------------------------------------------------- #
start_time = time.time()
logits = self.forward(**model_inputs, return_dict=True)
logits = self(**model_inputs)
# -------------------------------------------------------------------------- #
# Process the output
......@@ -237,8 +241,12 @@ class GenerationMixin:
if token_id in eos_token_id_list:
break
print("\n</s>")
print(
f"\n\n\n Time per step: prefill {round(time_list[0], 2)} token/ms\n",
)
print(
f"\n\n Time per step: {round(sum(time_list) / len(time_list), 2)} ms\n",
f" Time per step: decoder {round(sum(time_list[1:]) / (len(time_list) - 1), 2)} token/ms \n",
)
return output_tokens_list, output_content
......@@ -3,9 +3,8 @@ from typing import Dict, Optional, Union
import torch
from safetensors import safe_open
import glob
# from safetensors.torch import load_file as safe_load_file
# from safetensors.torch import save_file as safe_save_file
import infinicore
str_to_torch_dtype = {
......@@ -76,9 +75,19 @@ def get_model_state_dict(
"""
Load the model weights.
"""
path = os.path.join(model_path, "model.safetensors")
model_param = load_state_dict(path)
# --------------------------------------------------------- #
# Load the weights from the *.safetensors files
# --------------------------------------------------------- #
model_param = {}
for file_path in glob.glob(os.path.join(model_path, "*.safetensors")):
model_param.update(load_state_dict(file_path))
if model_param.get("lm_head.weight", None) is None:
model_param["lm_head.weight"] = model_param["model.embed_tokens.weight"]
# --------------------------------------------------------- #
# Adjust the device and dtype of the weights
# --------------------------------------------------------- #
torch_device = device.type
torch_dtype = infinicore.utils.to_torch_dtype(dtype)
......@@ -86,6 +95,9 @@ def get_model_state_dict(
for key, value in model_param.items():
model_param[key] = value.to(device=torch_device, dtype=torch_dtype)
# --------------------------------------------------------- #
# model_param_infini holds references to the torch.Tensor data
# --------------------------------------------------------- #
for key, value in model_param.items():
model_param_infini[key] = infinicore.from_torch(model_param[key])
......
from .llama import *
from .llama import AutoLlamaModel
__all__ = ["AutoLlamaModel"]
from .configuration_llama import * # noqa: F403
from .modeling_llama import * # noqa: F403
import os
from typing import Optional, Union
import infinicore
__all__ = ["AutoLlamaModel"]
class AutoLlamaModel:
@classmethod
def from_pretrained(
cls,
model_path: Optional[Union[str, os.PathLike]],
device: infinicore.device,
dtype=infinicore.dtype,
backend="python",
):
if backend == "python":
from . import modeling_llama
return modeling_llama.LlamaForCausalLM.from_pretrained(
model_path,
device=device,
dtype=dtype,
)
elif backend == "cpp":
from .backends import cpp
return cpp.LlamaForCausalLM.from_pretrained(
model_path,
device=device,
dtype=dtype,
)
raise KeyError("invalid backend")
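A minimal usage sketch of the dispatcher above, assuming a local TinyLlama checkpoint directory as in the README; `AutoLlamaModel` simply forwards to the Python `modeling_llama` implementation or to the C++ backend wrapper depending on `backend`:
```python
import infinicore
import infinilm

# Create a model with the desired backend; all arguments other than backend
# are passed straight through to the selected implementation's from_pretrained.
model = infinilm.AutoLlamaModel.from_pretrained(
    "~/TinyLlama-1.1B-Chat-v1.0",        # model directory, as in the README example
    device=infinicore.device("cpu", 0),  # or e.g. infinicore.device("cuda", 0)
    dtype=infinicore.bfloat16,
    backend="python",                    # "python" (default) or "cpp"
)
```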
from ....generation.utils import GenerationMixin
import infinicore
import os
from typing import Optional, Union
class LlamaForCausalLM(GenerationMixin):
def __init__(self):
super().__init__()
self.use_cache = False
self._model = None
raise NotImplementedError("The cpp backend is not implemented yet")
def forward(self, input_ids, position_ids, *args, **kwargs):
kv_caches = None
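# No Python-side KV cache is passed to the C++ model in this stub; use_cache is False, so the generation loop does not create one either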
return infinicore.Tensor(
self._model.forward(input_ids, position_ids, kv_caches)
)
def __call__(self, input_ids, position_ids, *args, **kwargs):
return self.forward(input_ids=input_ids, position_ids=position_ids)
@classmethod
def from_pretrained(
cls,
model_path: Union[str, os.PathLike],
device: infinicore.device,
dtype=infinicore.dtype,
):
"""
Load a pretrained LlamaForCausalLM model from a directory.
Args:
model_path: Path to the model directory containing config.json
device: infinicore.device instance
Returns:
LlamaForCausalLM instance
"""
raise NotImplementedError("Loading the cpp backend is not implemented yet")
......@@ -228,6 +228,3 @@ class LlamaConfig(PretrainedConfig):
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
__all__ = ["LlamaConfig"]
......@@ -196,14 +196,14 @@ class LlamaAttention(infinicore.nn.Module):
# --------------------------------------------------------------------------------------- #
# 对 Q和K, 加上 rope
# --------------------------------------------------------------------------------------- #
cache_position = kwargs.pop("cache_position", None)
if cache_position is None:
raise KeyError("cache_position error")
position_ids = kwargs.pop("position_ids", None)
if position_ids is None:
raise KeyError("position_ids error")
if rope_instance is None:
raise KeyError("rope_instance error")
query_states = rope_instance(query_states, cache_position)
key_states = rope_instance(key_states, cache_position)
query_states = rope_instance(query_states, position_ids)
key_states = rope_instance(key_states, position_ids)
# --------------------------------------------------------------------------------------- #
# kv cache
......@@ -338,7 +338,7 @@ class LlamaModel(infinicore.nn.Module):
def forward(
self,
input_ids,
cache_position,
position_ids,
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = None, # True
**kwargs,
......@@ -364,7 +364,7 @@ class LlamaModel(infinicore.nn.Module):
hidden_states = decoder_layer(
hidden_states,
past_key_values=past_key_values,
cache_position=cache_position,
position_ids=position_ids,
rope_instance=self.rope_instance,
**kwargs,
)
......@@ -384,6 +384,8 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
def __init__(self, config, **kwargs):
super().__init__()
self.config = config
self.use_cache = True
self.model = LlamaModel(config, **kwargs)
self.lm_head = infinicore.nn.Linear(
config.hidden_size,
......@@ -395,14 +397,14 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
def forward(
self,
input_ids,
cache_position,
position_ids,
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = None,
**kwargs,
):
last_token = self.model(
input_ids,
cache_position,
position_ids,
past_key_values=past_key_values,
use_cache=use_cache,
**kwargs,
......@@ -425,9 +427,3 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
config = LlamaConfig(**config_dict)
return LlamaForCausalLM(config, device=device, dtype=dtype)
__all__ = [
"LlamaModel",
"LlamaForCausalLM",
]