Unverified Commit 78ce921e authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub

Merge pull request #119 from pengcheng888/issue/112

issue/114 - Add support for loading weights from .bin files; update the README
parents 2abef3b7 300470cb
@@ -37,29 +37,38 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
```
## Usage (new version)

#### 1. Build and install `InfiniCore`
- Build and install `InfiniCore`; see the InfiniCore [`README`](https://github.com/InfiniTensor/InfiniCore) for details:
  - Set the `INFINI_ROOT` environment variable as prompted (defaults to `$HOME/.infini`)
  - Choose the xmake build configuration for your hardware platform
  - Build and install InfiniCore
  - Install the C++ library
  - Install the Python package
#### 2. Build and install `InfiniLM`
- Clone the project

  The repository contains submodules, so add `--recursive` or `--recurse-submodules` when cloning, e.g.:

  ```shell
  git clone --recursive https://github.com/InfiniTensor/InfiniLM.git
  ```

  Or update the submodules after a plain clone:

  ```shell
  git submodule update --init --recursive
  ```

- Build and install the `InfiniLM` Python package (this also installs the third-party dependencies):

  ```bash
  pip install -e .
  ```

- Single-round inference test
  - llama example

  ```bash
  python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>
@@ -68,3 +77,13 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
```bash
python examples/llama.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
```
- Distributed inference test
  - 9g example

  ```bash
  python examples/jiuge.py [--nvidia] --model_path=<path/to/model_dir> --backend=cpp --tp=NDEV --batch_size=MAX_BATCH
  ```

  - For example: the 9G7B model with the cpp backend, batch_size 16, distributed over 4 GPUs:

  ```bash
  python examples/jiuge.py --nvidia --model_path=/models/9G7B_MHA/ --backend=cpp --tp=4 --batch_size=16
  ```
\ No newline at end of file
@@ -30,15 +30,20 @@ void InferEngine::load_param(const std::string &name, const infinicore::Tensor &
        worker->load_param(name, param);
    }
}
//------------------------------------------------------
// state_dict
//------------------------------------------------------
std::vector<std::unordered_map<std::string, infinicore::nn::Parameter>> InferEngine::state_dict() {
    if (workers_.empty()) {
        throw std::runtime_error("Model object not found.");
    }
    // Collect one state dict per tensor-parallel worker.
    std::vector<std::unordered_map<std::string, infinicore::nn::Parameter>> results;
    results.reserve(workers_.size());
    for (auto &worker : workers_) {
        results.push_back(worker->state_dict());
    }
    return results;
}
//------------------------------------------------------
......
@@ -20,7 +20,7 @@ public:
    void load_param(const std::string &name, const infinicore::Tensor &param);

    // Return the parameters (i.e. weights and biases), one state dict per tensor-parallel worker.
    std::vector<std::unordered_map<std::string, infinicore::nn::Parameter>> state_dict();

    // Run a single forward pass on all workers and return the outputs from all ranks
    infinicore::Tensor generate(const infinicore::Tensor &input_ids,
......
@@ -44,17 +44,19 @@ inline void bind_infer_engine(py::module &m) {
"Load a parameter tensor into all workers (each worker picks its shard)")
.def("state_dict", [](InferEngine &self) {
// Return a dictionary containing references to the whole state of the module.
auto state_dict = self.state_dict();
py::list state_dict_tp_all;
for (const auto &state_dict_tp : self.state_dict()) {
py::dict result;
for (const auto &[name, param] : state_dict) {
for (const auto &[name, param] : state_dict_tp) {
result[py::cast(name)] = infinicore::Tensor(param);
}
return result;
state_dict_tp_all.append(result);
}
return state_dict_tp_all;
})
.def("generate", [](InferEngine &self, py::object input_ids, py::object position_ids) -> infinicore::Tensor { return self.generate(input_ids.cast<infinicore::Tensor>(), position_ids.cast<infinicore::Tensor>()); }, "Run inference on all ranks with arbitrary arguments")
.def("reset_cache", &InferEngine::reset_cache,
py::arg("pos") = 0, py::arg("async") = false,
"Reset the internal cache in all workers to a specific position (clears state between generations). "
.def("reset_cache", &InferEngine::reset_cache, py::arg("pos") = 0, py::arg("async") = false, "Reset the internal cache in all workers to a specific position (clears state between generations). "
"By default, this is synchronous. If async=True, this becomes asynchronous (unstable - use with caution).");
// Optionally, you can add __repr__ for debugging
......
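For orientation, here is a minimal Python sketch of how the new return shape might be consumed. The `InferEngine` constructor arguments are assumptions made for illustration; only `state_dict()` returning one dict per tensor-parallel rank follows from the binding above.

```python
# Hypothetical usage sketch of the state_dict binding above. The
# constructor arguments are assumed; the list-of-per-rank-dicts return
# shape is what the bound lambda builds.
engine = InferEngine(model_path="/models/9G7B_MHA/", tp=4)  # args assumed

per_rank = engine.state_dict()  # list[dict[str, infinicore.Tensor]]
print(f"tensor-parallel degree: {len(per_rank)}")
for rank, sd in enumerate(per_rank):
    # Each rank holds its own shard of every parameter.
    name, tensor = next(iter(sd.items()))
    print(f"rank {rank}: {len(sd)} params, e.g. {name} -> {tensor.shape}")
```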
@@ -4,7 +4,7 @@ from typing import Dict, Union
import torch
from safetensors import safe_open
import glob
from tqdm import tqdm
import infinicore
str_to_torch_dtype = {
@@ -33,20 +33,22 @@ def check_parameters(model_keys: list, already_loaded_keys: list):
    error_msgs: list[str] = []

    if len(unexpected_keys) > 0:
        error_msgs.append(
            "Unexpected key(s) in state_dict: {}. ".format(
                ", ".join('"{}"'.format(k) for k in unexpected_keys)
            )
        )

    if len(missing_keys) > 0:
        error_msgs.append(
            "Missing key(s) in state_dict: {}. ".format(
                ", ".join('"{}"'.format(k) for k in missing_keys)
            )
        )

    if len(error_msgs) > 0:
        raise RuntimeError(
            "Error(s) in loading state_dict\n\t{}".format("\n\t".join(error_msgs))
        )
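As a quick illustration of the new fail-fast behaviour (the function now raises instead of returning the messages), a minimal sketch with made-up key names:

```python
# Made-up key names, purely illustrative.
model_keys = ["embed.weight", "lm_head.weight"]
loaded_keys = ["embed.weight", "extra.bias"]

try:
    check_parameters(model_keys, loaded_keys)
except RuntimeError as err:
    # The message should list "extra.bias" as unexpected and
    # "lm_head.weight" as missing, joined into one error.
    print(err)
```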
def load_state_dict(
@@ -86,6 +88,9 @@ def get_model_state_dict(
"""
Load the model weights.
"""
print(" load weights ......")
torch_device = device.type
torch_dtype = infinicore.utils.to_torch_dtype(dtype)
@@ -117,18 +122,27 @@ def load_model_state_dict_by_file(
    dtype=infinicore.dtype,
) -> Dict[str, infinicore.Tensor]:
    """
    Load the model weights from file.
    """
    print("Loading weights ...")

    torch_device = "cpu"
    torch_dtype = infinicore.utils.to_torch_dtype(dtype)

    model_keys = model.state_dict_keyname()
    already_loaded_keys = []

    file_list = glob.glob(os.path.join(model_path, "*.safetensors"))
    if len(file_list) > 0:
        for file_path in tqdm(file_list, desc="Processing files"):
            tqdm.write(f"Processing: {os.path.basename(file_path)}")

            # --------------------------------------------------------- #
            # Load weights from a *.safetensors file
            # --------------------------------------------------------- #
            model_param = load_state_dict(
                file_path, device=torch_device, dtype=torch_dtype
            )
            already_loaded_keys.extend(model_param.keys())

            # --------------------------------------------------------- #
@@ -141,12 +155,25 @@ def load_model_state_dict_by_file(
            model.load_state_dict(model_param_infini, strict=False)
            infinicore.sync_device()
    elif os.path.exists(os.path.join(model_path, "pytorch_model.bin")):
        # Fall back to a single PyTorch .bin checkpoint.
        file_path = os.path.join(model_path, "pytorch_model.bin")
        model_params = torch.load(file_path, weights_only=True, map_location="cpu")

        model_param_infini = {}
        for key in model_params.keys():
            model_param_infini[key] = infinicore.from_torch(
                model_params[key].to(dtype=torch_dtype)
            )
            already_loaded_keys.append(key)

        model.load_state_dict(model_param_infini, strict=True)
        infinicore.sync_device()
    else:
        raise FileNotFoundError("Weight file not found.")

    check_parameters(model_keys, already_loaded_keys)
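A minimal calling sketch for the loader above. The model construction and the dtype constant are assumptions; the checkpoint directory may contain either `*.safetensors` files or a single `pytorch_model.bin`:

```python
import infinicore

# Assumed: a model object exposing state_dict_keyname() and
# load_state_dict(), e.g. the LlamaForCausalLM wrapper shown further below.
model = LlamaForCausalLM(config, distributed_config, device)  # args assumed

load_model_state_dict_by_file(
    model,
    model_path="/models/TinyLlama-1.1B-Chat-v1.0",  # path from the README example
    dtype=infinicore.float16,  # assumed dtype constant
)
```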
def load_model_state_dict_by_tensor(
    model: infinicore.nn.Module,
@@ -156,23 +183,37 @@
"""
Load the model weights by tensor.
"""
print(" load weights ......")
torch_dtype = infinicore.utils.to_torch_dtype(dtype)
model_keys = model.state_dict().keys()
model_keys = model.state_dict_keyname()
already_loaded_keys = []
for file in glob.glob(os.path.join(model_path, "*.safetensors")):
with safe_open(file, "pt", "cpu") as f:
file_list = glob.glob(os.path.join(model_path, "*.safetensors"))
if len(file_list) > 0:
for file_path in tqdm(file_list, desc="Processing files"):
tqdm.write(f"Processing: {os.path.basename(file_path)}")
with safe_open(file_path, "pt", "cpu") as f:
for name in f.keys():
param_infini = infinicore.from_torch(
weight_infini = infinicore.from_torch(
f.get_tensor(name).to(dtype=torch_dtype)
)
model.load_parameter(name, param_infini)
model.load_param(name, weight_infini)
already_loaded_keys.append(name)
infinicore.sync_stream()
error_msgs = check_parameters(model_keys, already_loaded_keys)
if len(error_msgs) > 0:
raise RuntimeError(
"Error(s) in loading state_dict\n\t{}".format("\n\t".join(error_msgs))
elif os.path.exists(os.path.join(model_path, "pytorch_model.bin")):
file_path = os.path.join(model_path, "pytorch_model.bin")
model_params = torch.load(file_path, weights_only=True, map_location="cpu")
for key in model_params.keys():
weight_infini = infinicore.from_torch(
model_params[key].to(dtype=torch_dtype)
)
model.load_param(key, weight_infini)
already_loaded_keys.append(key)
else:
raise KeyError("Weight file not found.")
check_parameters(model_keys, already_loaded_keys)
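The two loaders differ mainly in staging: `load_model_state_dict_by_file` collects each checkpoint file into a dict and applies it via `model.load_state_dict(...)`, while `load_model_state_dict_by_tensor` streams parameters one at a time through `model.load_param(...)`. A hypothetical dispatch helper (the name and the `streaming` flag are invented for illustration) could look like:

```python
# Hypothetical convenience wrapper over the two loaders defined above.
def load_weights(model, model_path, dtype, streaming=True):
    if streaming:
        # Streams parameters one by one through model.load_param(...),
        # keeping peak host memory low.
        load_model_state_dict_by_tensor(model, model_path=model_path, dtype=dtype)
    else:
        # Stages each file's tensors in a dict, then applies them via
        # model.load_state_dict(...).
        load_model_state_dict_by_file(model, model_path=model_path, dtype=dtype)
```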
@@ -189,12 +189,9 @@ class LlamaForCausalLM(GenerationMixin):
            config._underlying, distributed_config._underlying, device._underlying.type
        )

    def state_dict_keyname(self):
        """Return the model's parameter key names (taken from rank 0's state dict)."""
        return self._model.state_dict()[0].keys()

    def load_state_dict(self, state_dict, strict=None):
        """
......
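A small sketch of what the new helper returns; the constructor arguments are assumed, and the weight loaders above compare exactly these names against the checkpoint's keys:

```python
# Constructor arguments are assumptions for illustration.
model = LlamaForCausalLM(config, distributed_config, device)

# Key names from rank 0's state dict, as returned by the method above.
expected = set(model.state_dict_keyname())
print(f"model expects {len(expected)} parameters")
```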