Merge pull request #119 from pengcheng888/issue/112

issue/114 - 添加读取.bin文件权重的代码，更新readme

Merge pull request #119 from pengcheng888/issue/112
issue/114 - 添加读取.bin文件权重的代码，更新readme
78ce921e · PanZezhong1725 · GitHub · 2abef3b7 · 300470cb · 78ce921e
Unverified Commit 78ce921e authored Dec 09, 2025 by PanZezhong1725 Committed by GitHub Dec 09, 2025
6 changed files
--- a/README.md
+++ b/README.md
@@ -37,29 +37,38 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
 ```

 ## 使用方式(新版)
+#### 一、编译并安装 `InfiniCore`
+编译并安装 `InfiniCore`， 详情见 InfiniCore的 [`README`](https://github.com/InfiniTensor/InfiniCore) :

- 编译并安装 `InfiniCore`， 详情见 InfiniCore的 [`README`](https://github.com/InfiniTensor/InfiniCore) :
+- 注意根据提示设置好 `INFINI_ROOT` 环境变量（默认为 `$HOME/.infini`）
+- 根据硬件平台，选择 xmake 构建配置
+- 编译安装InfiniCore
+- 安装 C++ 库
+- 安装 Python 包

-    - 注意根据提示设置好 `INFINI_ROOT` 环境变量（默认为 `$HOME/.infini`）
-    - 根据硬件平台，选择 xmake 构建配置
-    - 编译安装InfiniCore
-    - 安装 C++ 库
-    - 安装 Python 包

+#### 二、编译并安装  `InfiniLM`
+  - 克隆项目
  
+    由于仓库中含有子模块，所以在克隆时请添加 `--recursive` 或 `--recurse-submodules`，如：

- 编译并安装 `InfiniLM` Python 包
-  - 安装第三方依赖
-  ```bash
+    ```shell
+    git clone --recursive https://github.com/InfiniTensor/InfiniLM.git
+    ```
+
+    或者在普通克隆后进行更新：
+
+    ```shell
    git submodule update --init --recursive
    ```

+
  - 安装 InfiniLM Python 包
    ```bash
      pip install -e .
    ```

- 单次推理测试
+  - 单次推理测试
    - llama示例
    ```bash
    python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>
@@ -68,3 +77,13 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
    ```bash
    python examples/llama.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
    ```
+  - 分布式推理测试
+    - 9g示例
+    ```bash
+    python examples/jiuge.py [--nvidia] --model_path=<path/to/model_dir> --backend=cpp --tp=NDEV --batch_size=MAX_BATCH
+    ```
+    
+    - 例如： 9G7B模型，cpp后端，batch_size为16，4卡分布式
+    ```bash
+    python examples/jiuge.py --nvidia --model_path=/models/9G7B_MHA/ --backend=cpp --tp=4 --batch_size=16 
+    ```
\ No newline at end of file
--- a/csrc/engine/infer_engine.cpp
+++ b/csrc/engine/infer_engine.cpp
@@ -30,15 +30,20 @@ void InferEngine::load_param(const std::string &name, const infinicore::Tensor &
        worker->load_param(name, param);
    }
 }
-
 //------------------------------------------------------
 // state_dict
 //------------------------------------------------------
-std::unordered_map<std::string, infinicore::nn::Parameter> InferEngine::state_dict() {
+std::vector<std::unordered_map<std::string, infinicore::nn::Parameter>> InferEngine::state_dict() {
+
+    std::vector<std::unordered_map<std::string, infinicore::nn::Parameter>> results;
    if (0 == workers_.size()) {
        throw std::runtime_error(" Model object not found. ");
    }
-    return workers_[0]->state_dict();
+
+    for (auto &worker : workers_) {
+        results.push_back(worker->state_dict());
+    }
+    return results;
 }

 //------------------------------------------------------

--- a/csrc/engine/infer_engine.hpp
+++ b/csrc/engine/infer_engine.hpp
@@ -20,7 +20,7 @@ public:
    void load_param(const std::string &name, const infinicore::Tensor &param);

    // return the parameters (i.e. weights and biases).
-    std::unordered_map<std::string, infinicore::nn::Parameter> state_dict();
+    std::vector<std::unordered_map<std::string, infinicore::nn::Parameter>> state_dict();

    // Run a single forward pass on all workers and return the outputs from all ranks
    infinicore::Tensor generate(const infinicore::Tensor &input_ids,

--- a/csrc/pybind11/engine.hpp
+++ b/csrc/pybind11/engine.hpp
@@ -44,17 +44,19 @@ inline void bind_infer_engine(py::module &m) {
             "Load a parameter tensor into all workers (each worker picks its shard)")
        .def("state_dict", [](InferEngine &self) {
            // Return a dictionary containing references to the whole state of the module.
-            auto state_dict = self.state_dict();
+            py::list state_dict_tp_all;
+            for (const auto &state_dict_tp : self.state_dict()) {
                py::dict result;
-            for (const auto &[name, param] : state_dict) {
+                for (const auto &[name, param] : state_dict_tp) {
                    result[py::cast(name)] = infinicore::Tensor(param);
                }
-            return result;
+                state_dict_tp_all.append(result);
+            }
+
+            return state_dict_tp_all;
        })
        .def("generate", [](InferEngine &self, py::object input_ids, py::object position_ids) -> infinicore::Tensor { return self.generate(input_ids.cast<infinicore::Tensor>(), position_ids.cast<infinicore::Tensor>()); }, "Run inference on all ranks with arbitrary arguments")
-        .def("reset_cache", &InferEngine::reset_cache,
-             py::arg("pos") = 0, py::arg("async") = false,
-             "Reset the internal cache in all workers to a specific position (clears state between generations). "
+        .def("reset_cache", &InferEngine::reset_cache, py::arg("pos") = 0, py::arg("async") = false, "Reset the internal cache in all workers to a specific position (clears state between generations). "
                                                                                                     "By default, this is synchronous. If async=True, this becomes asynchronous (unstable - use with caution).");

    // Optionally, you can add __repr__ for debugging

--- a/python/infinilm/modeling_utils.py
+++ b/python/infinilm/modeling_utils.py
@@ -4,7 +4,7 @@ from typing import Dict, Union
 import torch
 from safetensors import safe_open
 import glob
-
+from tqdm import tqdm
 import infinicore

 str_to_torch_dtype = {
@@ -33,20 +33,22 @@ def check_parameters(model_keys: list, already_loaded_keys: list):
    error_msgs: list[str] = []

    if len(unexpected_keys) > 0:
-        error_msgs.insert(
-            0,
+        error_msgs.append(
            "Unexpected key(s) in state_dict: {}. ".format(
                ", ".join('"{}"'.format(k) for k in unexpected_keys)
-            ),
+            )
        )
    if len(missing_keys) > 0:
-        error_msgs.insert(
-            0,
+        error_msgs.append(
            "Missing key(s) in state_dict: {}. ".format(
                ", ".join('"{}"'.format(k) for k in missing_keys)
-            ),
            )
-    return error_msgs
+        )
+
+    if len(error_msgs) > 0:
+        raise RuntimeError(
+            "Error(s) in loading state_dict\n\t{}".format("\n\t".join(error_msgs))
+        )


 def load_state_dict(
@@ -86,6 +88,9 @@ def get_model_state_dict(
    """
    Load the model weights.
    """
+
+    print(" load weights ......")
+
    torch_device = device.type
    torch_dtype = infinicore.utils.to_torch_dtype(dtype)

@@ -117,18 +122,27 @@ def load_model_state_dict_by_file(
    dtype=infinicore.dtype,
 ) -> Dict[str, infinicore.Tensor]:
    """
-    Load the model weights by file.
+    Load the model weights from file.
    """
+    print(" load weights ......")
+
    torch_device = "cpu"
    torch_dtype = infinicore.utils.to_torch_dtype(dtype)
-    model_keys = model.state_dict().keys()
+    model_keys = model.state_dict_keyname()

    already_loaded_keys = []
-    for file_path in glob.glob(os.path.join(model_path, "*.safetensors")):
+
+    file_list = glob.glob(os.path.join(model_path, "*.safetensors"))
+    if len(file_list) > 0:
+        for file_path in tqdm(file_list, desc="Processing files"):
+            tqdm.write(f"Processing: {os.path.basename(file_path)}")
+
            # --------------------------------------------------------- #
            #          Load weights from *.safetensors file
            # --------------------------------------------------------- #
-        model_param = load_state_dict(file_path, device=torch_device, dtype=torch_dtype)
+            model_param = load_state_dict(
+                file_path, device=torch_device, dtype=torch_dtype
+            )
            already_loaded_keys.extend(model_param.keys())

            # --------------------------------------------------------- #
@@ -141,12 +155,25 @@ def load_model_state_dict_by_file(
            model.load_state_dict(model_param_infini, strict=False)
            infinicore.sync_device()

-    error_msgs = check_parameters(model_keys, already_loaded_keys)
-    if len(error_msgs) > 0:
-        raise RuntimeError(
-            "Error(s) in loading state_dict\n\t{}".format("\n\t".join(error_msgs))
+    elif os.path.exists(os.path.join(model_path, "pytorch_model.bin")):
+        file_path = os.path.join(model_path, "pytorch_model.bin")
+        model_params = torch.load(file_path, weights_only=True, map_location="cpu")
+
+        model_param_infini = {}
+        for key in model_params.keys():
+            model_param_infini[key] = infinicore.from_torch(
+                model_params[key].to(dtype=torch_dtype)
            )

+            already_loaded_keys.append(key)
+
+        model.load_state_dict(model_param_infini, strict=True)
+        infinicore.sync_device()
+    else:
+        raise KeyError("Weight file not found.")
+
+    check_parameters(model_keys, already_loaded_keys)
+

 def load_model_state_dict_by_tensor(
    model: infinicore.nn.Module,
@@ -156,23 +183,37 @@ def load_model_state_dict_by_tensor(
    """
    Load the model weights by tensor.
    """
+    print(" load weights ......")

    torch_dtype = infinicore.utils.to_torch_dtype(dtype)
-    model_keys = model.state_dict().keys()
+    model_keys = model.state_dict_keyname()
    already_loaded_keys = []

-    for file in glob.glob(os.path.join(model_path, "*.safetensors")):
-        with safe_open(file, "pt", "cpu") as f:
+    file_list = glob.glob(os.path.join(model_path, "*.safetensors"))
+    if len(file_list) > 0:
+        for file_path in tqdm(file_list, desc="Processing files"):
+            tqdm.write(f"Processing: {os.path.basename(file_path)}")
+
+            with safe_open(file_path, "pt", "cpu") as f:
                for name in f.keys():
-                param_infini = infinicore.from_torch(
+                    weight_infini = infinicore.from_torch(
                        f.get_tensor(name).to(dtype=torch_dtype)
                    )
-                model.load_parameter(name, param_infini)
+                    model.load_param(name, weight_infini)
                    already_loaded_keys.append(name)
                    infinicore.sync_stream()

-    error_msgs = check_parameters(model_keys, already_loaded_keys)
-    if len(error_msgs) > 0:
-        raise RuntimeError(
-            "Error(s) in loading state_dict\n\t{}".format("\n\t".join(error_msgs))
+    elif os.path.exists(os.path.join(model_path, "pytorch_model.bin")):
+        file_path = os.path.join(model_path, "pytorch_model.bin")
+        model_params = torch.load(file_path, weights_only=True, map_location="cpu")
+
+        for key in model_params.keys():
+            weight_infini = infinicore.from_torch(
+                model_params[key].to(dtype=torch_dtype)
            )
+            model.load_param(key, weight_infini)
+            already_loaded_keys.append(key)
+    else:
+        raise KeyError("Weight file not found.")
+
+    check_parameters(model_keys, already_loaded_keys)
--- a/python/infinilm/models/llama/backends/cpp.py
+++ b/python/infinilm/models/llama/backends/cpp.py
@@ -189,12 +189,9 @@ class LlamaForCausalLM(GenerationMixin):
            config._underlying, distributed_config._underlying, device._underlying.type
        )

-    def state_dict(self):
-        """Get model state dictionary with parameter shapes"""
-        destination = OrderedDict()
-        for name, param in self._model.state_dict().items():
-            destination[name] = infinicore.Tensor(param)
-        return destination
+    def state_dict_keyname(self):
+        """Get model key name."""
+        return self._model.state_dict()[0].keys()

    def load_state_dict(self, state_dict, strict=None):
        """