Unverified Commit 39bea30a authored by pengcheng888's avatar pengcheng888 Committed by GitHub

Merge pull request #105 from pengcheng888/issue/102

issue/102 - Add functions for loading weight files per-file and per-tensor
parents 8c224092 7128a9a5
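The new loaders are exercised by the example script further down this diff; a minimal usage sketch, assuming a local safetensors checkpoint (the path below is illustrative only):

import os
import infinicore
import infinilm
from infinilm.modeling_utils import (
    load_model_state_dict_by_file,
    load_model_state_dict_by_tensor,
)

model_path = os.path.expanduser("~/TinyLlama-1.1B-Chat-v1.0")  # hypothetical path
model = infinilm.AutoLlamaModel.from_pretrained(
    model_path,
    device=infinicore.device("cpu", 0),
    dtype=infinicore.bfloat16,
)
# Load shard by shard (one safetensors file resident at a time) ...
load_model_state_dict_by_file(model, model_path, dtype=infinicore.bfloat16)
# ... or tensor by tensor (at most one torch copy alive at a time):
# load_model_state_dict_by_tensor(model, model_path, dtype=infinicore.bfloat16)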
......@@ -30,6 +30,16 @@ void InferEngine::load_param(const std::string &name, const infinicore::Tensor &
}
}
//------------------------------------------------------
// state_dict
//------------------------------------------------------
std::unordered_map<std::string, infinicore::nn::Parameter> InferEngine::state_dict() {
if (workers_.empty()) {
    throw std::runtime_error("Model object not found.");
}
return workers_[0]->state_dict();
}
//------------------------------------------------------
// generate
//------------------------------------------------------
......
......@@ -19,6 +19,9 @@ public:
// Load a parameter to all workers (each can extract its shard inside RankWorker)
void load_param(const std::string &name, const infinicore::Tensor &param);
// Return the parameters (i.e., weights and biases).
std::unordered_map<std::string, infinicore::nn::Parameter> state_dict();
// Run a single forward pass on all workers and return the outputs from all ranks
infinicore::Tensor generate(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids);
......
......@@ -74,6 +74,13 @@ void RankWorker::load_param(const std::string &name,
}
}
//------------------------------------------------------
// state_dict
//------------------------------------------------------
std::unordered_map<std::string, infinicore::nn::Parameter> RankWorker::state_dict() {
return this->model_->state_dict();
}
//------------------------------------------------------
// run -- asynchronous
//------------------------------------------------------
......
......@@ -28,6 +28,9 @@ public:
void load_param(const std::string &name,
const infinicore::Tensor &param);
// Return the parameters (i.e., weights and biases).
std::unordered_map<std::string, infinicore::nn::Parameter> state_dict();
// Submit a run (forward) job.
void run(const std::vector<std::any> &args);
......
......@@ -42,11 +42,16 @@ inline void bind_infer_engine(py::module &m) {
.def("load_param", &InferEngine::load_param,
py::arg("name"), py::arg("param"),
"Load a parameter tensor into all workers (each worker picks its shard)")
.def(
"generate", [](InferEngine &self, py::object input_ids, py::object position_ids) -> infinicore::Tensor {
return self.generate(input_ids.cast<infinicore::Tensor>(), position_ids.cast<infinicore::Tensor>());
},
"Run inference on all ranks with arbitrary arguments");
.def("state_dict", [](InferEngine &self) {
// Return a dictionary containing references to the whole state of the module.
auto state_dict = self.state_dict();
py::dict result;
for (const auto &[name, param] : state_dict) {
result[py::cast(name)] = infinicore::Tensor(param);
}
return result;
})
.def("generate", [](InferEngine &self, py::object input_ids, py::object position_ids) -> infinicore::Tensor { return self.generate(input_ids.cast<infinicore::Tensor>(), position_ids.cast<infinicore::Tensor>()); }, "Run inference on all ranks with arbitrary arguments");
// Optionally, you can add __repr__ for debugging
m.attr("InferEngine").attr("__repr__") = py::cpp_function([](const InferEngine &self) {
......
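On the Python side, the freshly bound state_dict() comes back as a plain dict of infinicore tensors gathered from rank 0, with load_param() as the inverse direction. A minimal hedged sketch (it assumes an already constructed engine, and that the Python Tensor wrapper exposes .shape like its C++ counterpart):

def dump_engine_params(engine) -> None:
    # engine.state_dict() -> dict[str, infinicore.Tensor], taken from worker 0
    for name, tensor in engine.state_dict().items():
        print(name, tensor.shape)
    # The inverse direction is engine.load_param(name, tensor), which fans a
    # tensor out to every worker; each worker extracts its own shard.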
......@@ -172,20 +172,15 @@ inline void bind_llama(py::module &m) {
}),
py::arg("config"), py::arg("device"), py::arg("dtype") = py::none())
.def("state_dict", [](const LlamaForCausalLM &model) {
// Convert state_dict to Python dict with shape information
// Return a dictionary containing references to the whole state of the module.
auto state_dict = model.state_dict();
py::dict result;
for (const auto &[name, param] : state_dict) {
// Parameter is a shared_ptr<Tensor>, get shape from it
py::dict param_info;
param_info["shape"] = py::cast(param->shape());
param_info["dtype"] = py::cast(static_cast<int>(param->dtype()));
result[py::cast(name)] = param_info;
result[py::cast(name)] = infinicore::Tensor(param);
}
return result;
})
.def(
"get_parameter", [](const LlamaForCausalLM &model, const std::string &name) {
.def("get_parameter", [](const LlamaForCausalLM &model, const std::string &name) {
// Get actual tensor parameter by name
auto state_dict = model.state_dict();
auto it = state_dict.find(name);
......@@ -194,11 +189,8 @@ inline void bind_llama(py::module &m) {
const infinicore::Tensor &tensor = it->second;
return tensor;
}
throw std::runtime_error("Parameter '" + name + "' not found in model");
},
py::arg("name"))
.def(
"load_state_dict", [convert_to_tensor](LlamaForCausalLM &model, py::dict state_dict, const Device &device) {
throw std::runtime_error("Parameter '" + name + "' not found in model"); }, py::arg("name"))
.def("load_state_dict", [convert_to_tensor](LlamaForCausalLM &model, py::dict state_dict, const Device &device) {
// Convert Python dict to C++ state_dict
std::unordered_map<std::string, infinicore::Tensor> cpp_state_dict;
for (auto item : state_dict) {
......@@ -206,12 +198,9 @@ inline void bind_llama(py::module &m) {
py::object value = item.second.cast<py::object>();
cpp_state_dict.emplace(key, convert_to_tensor(value, device));
}
model.load_state_dict(cpp_state_dict);
},
py::arg("state_dict"), py::arg("device"))
model.load_state_dict(cpp_state_dict); }, py::arg("state_dict"), py::arg("device"))
.def("config", &LlamaForCausalLM::config, py::return_value_policy::reference_internal)
.def(
"forward", [convert_to_tensor](const LlamaForCausalLM &model, py::object input_ids, py::object position_ids, py::object kv_caches = py::none()) {
.def("forward", [convert_to_tensor](const LlamaForCausalLM &model, py::object input_ids, py::object position_ids, py::object kv_caches = py::none()) {
// Helper to extract C++ tensor from Python object
auto get_tensor = [convert_to_tensor](py::object obj) -> infinicore::Tensor {
// If it's already a Python InfiniCore tensor wrapper, extract the underlying tensor
......@@ -247,8 +236,8 @@ inline void bind_llama(py::module &m) {
// kv_caches is accepted but not forwarded yet; always pass nullptr for now
std::vector<void *> *kv_caches_ptr = nullptr;
return model.forward(infini_input_ids, infini_position_ids, kv_caches_ptr);
},
return model.forward(infini_input_ids, infini_position_ids, kv_caches_ptr); },
//
py::arg("input_ids"), py::arg("position_ids"), py::arg("kv_caches") = py::none());
}
......
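The net effect of this hunk on the Python side: state_dict() now yields the tensors themselves rather than shape/dtype metadata, so a state dict can be round-tripped through load_state_dict(). A hedged sketch, assuming the checkpoint actually contains "lm_head.weight":

device = infinicore.device("cpu", 0)
sd = model.state_dict()                    # dict[str, infinicore.Tensor]
w = model.get_parameter("lm_head.weight")  # raises RuntimeError if absent
model.load_state_dict(sd, device)          # feeds the same tensors back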
import infinicore
from transformers import AutoTokenizer
from tokenizers import decoders as _dec
from infinilm.modeling_utils import load_model_state_dict_by_file
import infinilm
from infinilm.distributed import DistConfig
import argparse
import sys
import time
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))
def get_args():
parser = argparse.ArgumentParser(description="Run the Llama example")
parser.add_argument(
"--cpu",
action="store_true",
help="Run cpu test",
)
parser.add_argument(
"--nvidia",
action="store_true",
help="Run nvidia test",
)
parser.add_argument(
"--metax",
action="store_true",
help="Run metax test",
)
parser.add_argument(
"--moore",
action="store_true",
help="Run moore test",
)
parser.add_argument(
"--iluvatar",
action="store_true",
help="Run iluvatar test",
)
parser.add_argument(
"--model_path",
type=str,
required=True,
help="model_path",
)
parser.add_argument(
"--max_new_tokens",
type=int,
default=100,
help="max_new_tokens",
)
parser.add_argument(
"--backend",
type=str,
default="python",
help="python or cpp model",
)
parser.add_argument(
"--dtype",
type=str,
default="bfloat16",
help="float32, float16, bfloat16",
)
parser.add_argument(
"--batch_size",
type=int,
default=1,
help="number of prompts in a batch",
)
parser.add_argument(
"--prompt",
type=str,
default="How are you",
help="input prompt",
)
parser.add_argument(
"--tp",
type=int,
default=None,
help="total rank for tensor parallel",
)
return parser.parse_args()
def test(
prompts: str | list[str],
model_path,
max_new_tokens=100,
infini_dtype=infinicore.bfloat16,
infini_device=infinicore.device("cpu", 0),
backend="python",
):
model_path = os.path.expanduser(model_path)
# ---------------------------------------------------------------------------- #
# Create the model
# ---------------------------------------------------------------------------- #
model = infinilm.AutoLlamaModel.from_pretrained(
model_path,
device=infini_device,
dtype=infini_dtype,
backend=backend,
distributed_config=DistConfig(args.tp),
)
# ---------------------------------------------------------------------------- #
# Load the weights
# ---------------------------------------------------------------------------- #
load_model_state_dict_by_file(model, model_path, dtype=infini_dtype)
# ---------------------------------------------------------------------------- #
# Create the tokenizer
# ---------------------------------------------------------------------------- #
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if "llama" == model.config.model_type:
backend = getattr(tokenizer, "backend_tokenizer", None)
target = getattr(backend, "_tokenizer", backend)
norm = getattr(target, "normalizer", None)
dec = getattr(target, "decoder", None)
sn = repr(norm)[:800] if norm is not None else ""
sd = repr(dec)[:800] if dec is not None else ""
has_prepend = "Prepend" in sn
has_strip = "Strip" in sd
if has_prepend and has_strip:
target.decoder = _dec.Sequence(
[
_dec.Replace("▁", " "),
_dec.ByteFallback(),
_dec.Fuse(),
]
)
# ---------------------------------------------------------------------------- #
# Token encoding
# ---------------------------------------------------------------------------- #
# prompt = "What is the highest mountain in Shandong?"
if isinstance(prompts, str):
prompts = [prompts]
input_contents = [
tokenizer.apply_chat_template(
conversation=[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
for prompt in prompts
]
print(input_contents[0], end="", flush=True)
input_ids_list = tokenizer.batch_encode_plus(input_contents)[
"input_ids"
] # List: [[1, 1128, 526, 366, 29892]]
# ---------------------------------------------------------------------------- #
# Autoregressive generation
# ---------------------------------------------------------------------------- #
input_ids_infini = infinicore.from_list(input_ids_list)
t1 = time.time()
print("=================== start generate ====================")
model.generate(
input_ids_infini,
max_new_tokens=max_new_tokens,
device=infini_device,
tokenizer=tokenizer,
)
t2 = time.time()
print(
f"total_time: {round((t2 - t1) * 1000, 2)} ms",
)
if __name__ == "__main__":
args = get_args()
print(args)
# Select the device from the command-line flags
device_str = "cpu"
if args.cpu:
device_str = "cpu"
elif args.nvidia:
device_str = "cuda"
elif args.metax:
device_str = "cuda"
elif args.moore:
device_str = "musa"
elif args.iluvatar:
device_str = "cuda"
else:
print(
"Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>\n"
"such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
)
sys.exit(1)
prompts = [args.prompt for _ in range(args.batch_size)]
model_path = args.model_path
max_new_tokens = args.max_new_tokens
backend = args.backend
infini_device = infinicore.device(device_str, 0)
if args.dtype == "float32":
infini_dtype = infinicore.float32
elif args.dtype == "bfloat16":
infini_dtype = infinicore.bfloat16
elif args.dtype == "float16":
infini_dtype = infinicore.float16
else:
raise ValueError(f"Unsupported dtype: {args.dtype}")
test(
prompts,
model_path,
max_new_tokens,
infini_device=infini_device,
infini_dtype=infini_dtype,
backend=backend,
)
......@@ -3,7 +3,6 @@ from transformers import AutoTokenizer
from tokenizers import decoders as _dec
from infinilm.modeling_utils import get_model_state_dict
import infinilm
from infinilm.distributed import DistConfig
import argparse
import sys
import time
......@@ -76,12 +75,6 @@ def get_args():
default="How are you",
help="input prompt",
)
parser.add_argument(
"--tp",
type=int,
default=None,
help="total rank for tensor parallel",
)
return parser.parse_args()
......@@ -103,7 +96,6 @@ def test(
device=infini_device,
dtype=infini_dtype,
backend=backend,
distributed_config=DistConfig(args.tp),
)
# ---------------------------------------------------------------------------- #
......@@ -115,7 +107,7 @@ def test(
dtype=infini_dtype,
)
model.load_state_dict(model_param_infini)
model.load_state_dict(model_param_infini, strict=True)
# ---------------------------------------------------------------------------- #
# 创建 tokenizer
......
import os
from typing import Dict, Optional, Union
from typing import Dict, Union
import torch
from safetensors import safe_open
......@@ -23,15 +23,39 @@ str_to_torch_dtype = {
}
def check_parameters(model_keys: list, already_loaded_keys: list):
model_keys = set(model_keys)
already_loaded_keys = set(already_loaded_keys)
intersection = model_keys & already_loaded_keys
missing_keys = model_keys - intersection
unexpected_keys = already_loaded_keys - intersection
error_msgs: list[str] = []
if len(unexpected_keys) > 0:
error_msgs.insert(
0,
"Unexpected key(s) in state_dict: {}. ".format(
", ".join('"{}"'.format(k) for k in unexpected_keys)
),
)
if len(missing_keys) > 0:
error_msgs.insert(
0,
"Missing key(s) in state_dict: {}. ".format(
", ".join('"{}"'.format(k) for k in missing_keys)
),
)
return error_msgs
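# A quick sanity check of check_parameters (hedged example; the keys below are
# illustrative only):
#
#     msgs = check_parameters(
#         model_keys=["model.embed_tokens.weight", "lm_head.weight"],
#         already_loaded_keys=["model.embed_tokens.weight", "model.norm.weight"],
#     )
#     # msgs[0] reports the missing "lm_head.weight",
#     # msgs[1] the unexpected "model.norm.weight"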
def load_state_dict(
checkpoint_file: Union[str, os.PathLike],
map_location: Optional[Union[str, torch.device]] = "cpu",
weights_only: bool = True,
checkpoint_file: Union[str, os.PathLike], device="cpu", dtype=torch.bfloat16
) -> Dict[str, torch.Tensor]:
"""
Reads a `safetensor` checkpoint file. We load the checkpoint on "cpu" by default.
"""
# Use safetensors if possible
if not checkpoint_file.endswith(".safetensors"):
return {}
......@@ -49,20 +73,7 @@ def load_state_dict(
)
for k in f.keys():
if map_location == "meta":
_slice = f.get_slice(k)
k_dtype = _slice.get_dtype()
if k_dtype in str_to_torch_dtype:
dtype = str_to_torch_dtype[k_dtype]
else:
raise ValueError(
f"Cannot load safetensors of unknown dtype {k_dtype}"
)
state_dict[k] = torch.empty(
size=_slice.get_shape(), dtype=dtype, device="meta"
)
else:
state_dict[k] = f.get_tensor(k)
state_dict[k] = f.get_tensor(k).to(device=device, dtype=dtype)
return state_dict
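# Example (hedged; the shard filename is hypothetical): load one shard onto
# the CPU as bfloat16.
#
#     shard = load_state_dict(
#         "model-00001-of-00002.safetensors", device="cpu", dtype=torch.bfloat16
#     )
#     # 'shard' maps tensor names to torch.Tensors already moved to the
#     # requested device and dtype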
......@@ -75,30 +86,93 @@ def get_model_state_dict(
"""
Load the model weights.
"""
torch_device = device.type
torch_dtype = infinicore.utils.to_torch_dtype(dtype)
# --------------------------------------------------------- #
# Load weights from all *.safetensors files
# --------------------------------------------------------- #
model_param = {}
for file_path in glob.glob(os.path.join(model_path, "*.safetensors")):
model_param.update(load_state_dict(file_path))
model_param.update(
load_state_dict(file_path, device=torch_device, dtype=torch_dtype)
)
if model_param.get("lm_head.weight", None) is None:
model_param["lm_head.weight"] = model_param["model.embed_tokens.weight"]
# --------------------------------------------------------- #
# Adjust the device and dtype of the weights
# model_param_infini references torch.Tensor
# --------------------------------------------------------- #
torch_device = device.type
model_param_infini = {}
for key in model_param.keys():
model_param_infini[key] = infinicore.from_torch(model_param[key])
return model_param_infini
def load_model_state_dict_by_file(
    model: infinicore.nn.Module,
    model_path: str,
    dtype=infinicore.bfloat16,
) -> None:
    """
    Load the model weights file by file.
    """
    torch_device = "cpu"
    torch_dtype = infinicore.utils.to_torch_dtype(dtype)
    model_keys = model.state_dict().keys()
    already_loaded_keys = []
    for file_path in glob.glob(os.path.join(model_path, "*.safetensors")):
        # --------------------------------------------------------- #
        # Load weights from one *.safetensors file
        # --------------------------------------------------------- #
        model_param = load_state_dict(file_path, device=torch_device, dtype=torch_dtype)
        already_loaded_keys.extend(model_param.keys())
        # --------------------------------------------------------- #
        # model_param_infini references the underlying torch.Tensor
        # --------------------------------------------------------- #
        model_param_infini = {}
        for key in model_param.keys():
            model_param_infini[key] = infinicore.from_torch(model_param[key])
        model.load_state_dict(model_param_infini, strict=False)
    infinicore.sync_device()
    error_msgs = check_parameters(model_keys, already_loaded_keys)
    if len(error_msgs) > 0:
        raise RuntimeError(
            "Error(s) in loading state_dict\n\t{}".format("\n\t".join(error_msgs))
        )
def load_model_state_dict_by_tensor(
model: infinicore.nn.Module,
model_path: str,
dtype=infinicore.bfloat16,
):
"""
Load the model weights tensor by tensor.
"""
torch_dtype = infinicore.utils.to_torch_dtype(dtype)
model_keys = model.state_dict().keys()
already_loaded_keys = []
for file in glob.glob(os.path.join(model_path, "*.safetensors")):
with safe_open(file, "pt", "cpu") as f:
for name in f.keys():
param_infini = infinicore.from_torch(
f.get_tensor(name).to(dtype=torch_dtype)
)
model.load_parameter(name, param_infini)
already_loaded_keys.append(name)
infinicore.sync_stream()
error_msgs = check_parameters(model_keys, already_loaded_keys)
if len(error_msgs) > 0:
raise RuntimeError(
"Error(s) in loading state_dict\n\t{}".format("\n\t".join(error_msgs))
)
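Design note: load_model_state_dict_by_file materializes one whole safetensors shard in host memory and applies it with load_state_dict(strict=False), while load_model_state_dict_by_tensor streams parameters one at a time through model.load_parameter, keeping peak host memory near a single tensor; both finish by validating key coverage with check_parameters.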
......@@ -18,6 +18,10 @@ class AutoLlamaModel:
if backend == "python":
from . import modeling_llama
print("\n***************************************************************")
print("\t\t Loading Llama Model with Python Backend")
print(f"\t\t Device: {device}, DType: {dtype}")
print("***************************************************************\n")
return modeling_llama.LlamaForCausalLM.from_pretrained(
model_path,
device=device,
......@@ -28,6 +32,10 @@ class AutoLlamaModel:
elif backend == "cpp":
from .backends import cpp
print("\n***************************************************************")
print("\t\tLoading Llama Model with C++ Backend")
print(f"\t\tDevice: {device}, DType: {dtype}")
print("***************************************************************\n")
return cpp.LlamaForCausalLM.from_pretrained(
model_path,
device=device,
......
......@@ -6,6 +6,7 @@ from infinilm.distributed import DistConfig
import json
import os
from typing import Optional, Union
from collections import OrderedDict
class LlamaConfig:
......@@ -120,9 +121,12 @@ class LlamaForCausalLM(GenerationMixin):
def state_dict(self):
"""Get model state dictionary with parameter shapes"""
return self._model.state_dict()
destination = OrderedDict()
for name, param in self._model.state_dict().items():
destination[name] = infinicore.Tensor(param)
return destination
def load_state_dict(self, state_dict):
def load_state_dict(self, state_dict, strict=None):
"""
Load state dictionary into the model
......