"...git@developer.sourcefind.cn:tsoc/superbenchmark.git" did not exist on "dfbd70b129c7420deff2a19de28c12c3ce2d431f"
Unverified Commit 39bea30a authored by pengcheng888's avatar pengcheng888 Committed by GitHub
Browse files

Merge pull request #105 from pengcheng888/issue/102

issue/102 - Add functions that load weight files per-file and per-tensor
parents 8c224092 7128a9a5
...@@ -30,6 +30,16 @@ void InferEngine::load_param(const std::string &name, const infinicore::Tensor & ...@@ -30,6 +30,16 @@ void InferEngine::load_param(const std::string &name, const infinicore::Tensor &
} }
} }
//------------------------------------------------------
// state_dict
//------------------------------------------------------
/// @brief Return the model parameters (weights and biases).
///
/// Delegates to rank-0's worker; each RankWorker holds its own shard, so the
/// returned map reflects rank 0's view of the parameters.
/// @return Map from parameter name to infinicore::nn::Parameter.
/// @throws std::runtime_error if no workers exist (no model was created).
std::unordered_map<std::string, infinicore::nn::Parameter> InferEngine::state_dict() {
    if (workers_.empty()) { // idiomatic emptiness check instead of `0 == size()`
        throw std::runtime_error(" Model object not found. ");
    }
    return workers_[0]->state_dict();
}
//------------------------------------------------------ //------------------------------------------------------
// generate // generate
//------------------------------------------------------ //------------------------------------------------------
......
...@@ -19,9 +19,12 @@ public: ...@@ -19,9 +19,12 @@ public:
// Load a parameter to all workers (each can extract its shard inside RankWorker) // Load a parameter to all workers (each can extract its shard inside RankWorker)
void load_param(const std::string &name, const infinicore::Tensor &param); void load_param(const std::string &name, const infinicore::Tensor &param);
// return the parameters (i.e. weights and biases).
std::unordered_map<std::string, infinicore::nn::Parameter> state_dict();
// Run a single forward pass on all workers and return the outputs from all ranks // Run a single forward pass on all workers and return the outputs from all ranks
infinicore::Tensor generate(const infinicore::Tensor &input_ids, infinicore::Tensor generate(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids); const infinicore::Tensor &position_ids);
~InferEngine(); ~InferEngine();
......
...@@ -74,6 +74,13 @@ void RankWorker::load_param(const std::string &name, ...@@ -74,6 +74,13 @@ void RankWorker::load_param(const std::string &name,
} }
} }
//------------------------------------------------------
// state_dict --
//------------------------------------------------------
std::unordered_map<std::string, infinicore::nn::Parameter> RankWorker::state_dict() {
return this->model_->state_dict();
}
//------------------------------------------------------ //------------------------------------------------------
// run -- asynchronous // run -- asynchronous
//------------------------------------------------------ //------------------------------------------------------
......
...@@ -28,6 +28,9 @@ public: ...@@ -28,6 +28,9 @@ public:
void load_param(const std::string &name, void load_param(const std::string &name,
const infinicore::Tensor &param); const infinicore::Tensor &param);
// return the parameters (i.e. weights and biases).
std::unordered_map<std::string, infinicore::nn::Parameter> state_dict();
// Submit a run (forward) job. // Submit a run (forward) job.
void run(const std::vector<std::any> &args); void run(const std::vector<std::any> &args);
......
...@@ -42,11 +42,16 @@ inline void bind_infer_engine(py::module &m) { ...@@ -42,11 +42,16 @@ inline void bind_infer_engine(py::module &m) {
.def("load_param", &InferEngine::load_param, .def("load_param", &InferEngine::load_param,
py::arg("name"), py::arg("param"), py::arg("name"), py::arg("param"),
"Load a parameter tensor into all workers (each worker picks its shard)") "Load a parameter tensor into all workers (each worker picks its shard)")
.def( .def("state_dict", [](InferEngine &self) {
"generate", [](InferEngine &self, py::object input_ids, py::object position_ids) -> infinicore::Tensor { // Return a dictionary containing references to the whole state of the module.
return self.generate(input_ids.cast<infinicore::Tensor>(), position_ids.cast<infinicore::Tensor>()); auto state_dict = self.state_dict();
}, py::dict result;
"Run inference on all ranks with arbitrary arguments"); for (const auto &[name, param] : state_dict) {
result[py::cast(name)] = infinicore::Tensor(param);
}
return result;
})
.def("generate", [](InferEngine &self, py::object input_ids, py::object position_ids) -> infinicore::Tensor { return self.generate(input_ids.cast<infinicore::Tensor>(), position_ids.cast<infinicore::Tensor>()); }, "Run inference on all ranks with arbitrary arguments");
// Optionally, you can add __repr__ for debugging // Optionally, you can add __repr__ for debugging
m.attr("InferEngine").attr("__repr__") = py::cpp_function([](const InferEngine &self) { m.attr("InferEngine").attr("__repr__") = py::cpp_function([](const InferEngine &self) {
......
...@@ -172,20 +172,15 @@ inline void bind_llama(py::module &m) { ...@@ -172,20 +172,15 @@ inline void bind_llama(py::module &m) {
}), }),
py::arg("config"), py::arg("device"), py::arg("dtype") = py::none()) py::arg("config"), py::arg("device"), py::arg("dtype") = py::none())
.def("state_dict", [](const LlamaForCausalLM &model) { .def("state_dict", [](const LlamaForCausalLM &model) {
// Convert state_dict to Python dict with shape information // Return a dictionary containing references to the whole state of the module.
auto state_dict = model.state_dict(); auto state_dict = model.state_dict();
py::dict result; py::dict result;
for (const auto &[name, param] : state_dict) { for (const auto &[name, param] : state_dict) {
// Parameter is a shared_ptr<Tensor>, get shape from it result[py::cast(name)] = infinicore::Tensor(param);
py::dict param_info;
param_info["shape"] = py::cast(param->shape());
param_info["dtype"] = py::cast(static_cast<int>(param->dtype()));
result[py::cast(name)] = param_info;
} }
return result; return result;
}) })
.def( .def("get_parameter", [](const LlamaForCausalLM &model, const std::string &name) {
"get_parameter", [](const LlamaForCausalLM &model, const std::string &name) {
// Get actual tensor parameter by name // Get actual tensor parameter by name
auto state_dict = model.state_dict(); auto state_dict = model.state_dict();
auto it = state_dict.find(name); auto it = state_dict.find(name);
...@@ -194,11 +189,8 @@ inline void bind_llama(py::module &m) { ...@@ -194,11 +189,8 @@ inline void bind_llama(py::module &m) {
const infinicore::Tensor &tensor = it->second; const infinicore::Tensor &tensor = it->second;
return tensor; return tensor;
} }
throw std::runtime_error("Parameter '" + name + "' not found in model"); throw std::runtime_error("Parameter '" + name + "' not found in model"); }, py::arg("name"))
}, .def("load_state_dict", [convert_to_tensor](LlamaForCausalLM &model, py::dict state_dict, const Device &device) {
py::arg("name"))
.def(
"load_state_dict", [convert_to_tensor](LlamaForCausalLM &model, py::dict state_dict, const Device &device) {
// Convert Python dict to C++ state_dict // Convert Python dict to C++ state_dict
std::unordered_map<std::string, infinicore::Tensor> cpp_state_dict; std::unordered_map<std::string, infinicore::Tensor> cpp_state_dict;
for (auto item : state_dict) { for (auto item : state_dict) {
...@@ -206,12 +198,9 @@ inline void bind_llama(py::module &m) { ...@@ -206,12 +198,9 @@ inline void bind_llama(py::module &m) {
py::object value = item.second.cast<py::object>(); py::object value = item.second.cast<py::object>();
cpp_state_dict.emplace(key, convert_to_tensor(value, device)); cpp_state_dict.emplace(key, convert_to_tensor(value, device));
} }
model.load_state_dict(cpp_state_dict); model.load_state_dict(cpp_state_dict); }, py::arg("state_dict"), py::arg("device"))
},
py::arg("state_dict"), py::arg("device"))
.def("config", &LlamaForCausalLM::config, py::return_value_policy::reference_internal) .def("config", &LlamaForCausalLM::config, py::return_value_policy::reference_internal)
.def( .def("forward", [convert_to_tensor](const LlamaForCausalLM &model, py::object input_ids, py::object position_ids, py::object kv_caches = py::none()) {
"forward", [convert_to_tensor](const LlamaForCausalLM &model, py::object input_ids, py::object position_ids, py::object kv_caches = py::none()) {
// Helper to extract C++ tensor from Python object // Helper to extract C++ tensor from Python object
auto get_tensor = [convert_to_tensor](py::object obj) -> infinicore::Tensor { auto get_tensor = [convert_to_tensor](py::object obj) -> infinicore::Tensor {
// If it's already a Python InfiniCore tensor wrapper, extract underlying // If it's already a Python InfiniCore tensor wrapper, extract underlying
...@@ -247,9 +236,9 @@ inline void bind_llama(py::module &m) { ...@@ -247,9 +236,9 @@ inline void bind_llama(py::module &m) {
// Handle kv_caches if provided // Handle kv_caches if provided
std::vector<void *> *kv_caches_ptr = nullptr; std::vector<void *> *kv_caches_ptr = nullptr;
return model.forward(infini_input_ids, infini_position_ids, kv_caches_ptr); return model.forward(infini_input_ids, infini_position_ids, kv_caches_ptr); },
}, //
py::arg("input_ids"), py::arg("position_ids"), py::arg("kv_caches") = py::none()); py::arg("input_ids"), py::arg("position_ids"), py::arg("kv_caches") = py::none());
} }
} // namespace infinilm::models::llama } // namespace infinilm::models::llama
import argparse
import os
import sys
import time

# Make the repo-local `python` package importable BEFORE importing project
# modules — the original ordering ran this insert after `import infinilm`,
# which only works when infinilm is already installed site-wide.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))

import infinicore
from transformers import AutoTokenizer
from tokenizers import decoders as _dec

import infinilm
from infinilm.modeling_utils import load_model_state_dict_by_file
from infinilm.distributed import DistConfig
def get_args():
    """Build and parse the command-line arguments for the Llama example."""
    parser = argparse.ArgumentParser(description="run Llama args")

    # One boolean switch per supported hardware target.
    for target in ("cpu", "nvidia", "metax", "moore", "iluvatar"):
        parser.add_argument(
            "--" + target,
            action="store_true",
            help="Run {} test".format(target),
        )

    # Model location and generation settings.
    parser.add_argument("--model_path", type=str, required=True, help="model_path")
    parser.add_argument(
        "--max_new_tokens", type=int, default=100, help="max_new_tokens"
    )
    parser.add_argument(
        "--backend", type=str, default="python", help="python or cpp model"
    )
    parser.add_argument(
        "--dtype", type=str, default="bfloat16", help="float32, float16, bfloat16"
    )
    parser.add_argument(
        "--batch_size", type=int, default=1, help="number of prompts in a batch"
    )
    parser.add_argument(
        "--prompt", type=str, default="How are you", help="input prompt"
    )
    parser.add_argument(
        "--tp", type=int, default=None, help="total rank for tensor parallel"
    )

    return parser.parse_args()
def test(
    prompts: str | list[str],
    model_path,
    max_new_tokens=100,
    infini_dtype=infinicore.bfloat16,
    infini_device=infinicore.device("cpu", 0),
    backend="python",
):
    """End-to-end generation demo: build the model, load weights per file,
    tokenize the prompts, and run autoregressive generation.

    Args:
        prompts: a single prompt string or a list of prompts (batch).
        model_path: directory with the model config and *.safetensors weights.
        max_new_tokens: generation length limit passed to model.generate().
        infini_dtype: infinicore dtype for weights/compute.
        infini_device: target device; NOTE the default is evaluated once at
            import time.
        backend: "python" or "cpp" model implementation.
    """
    model_path = os.path.expanduser(model_path)
    # ---------------------------------------------------------------------------- #
    # Create the model
    # ---------------------------------------------------------------------------- #
    model = infinilm.AutoLlamaModel.from_pretrained(
        model_path,
        device=infini_device,
        dtype=infini_dtype,
        backend=backend,
        # NOTE(review): reads module-level `args` (set under __main__); this
        # breaks if test() is imported elsewhere — consider a `tp` parameter.
        distributed_config=DistConfig(args.tp),
    )
    # ---------------------------------------------------------------------------- #
    # Load the weights (streamed file-by-file)
    # ---------------------------------------------------------------------------- #
    load_model_state_dict_by_file(model, model_path, dtype=infini_dtype)
    # ---------------------------------------------------------------------------- #
    # Create the tokenizer
    # ---------------------------------------------------------------------------- #
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    if "llama" == model.config.model_type:
        # Some SentencePiece-style llama tokenizers prepend "▁" and strip the
        # leading space on decode; if both behaviors are detected, swap in a
        # decoder that preserves spaces for streamed output.
        backend = getattr(tokenizer, "backend_tokenizer", None)
        target = getattr(backend, "_tokenizer", backend)
        norm = getattr(target, "normalizer", None)
        dec = getattr(target, "decoder", None)
        # repr() is truncated: only used for a substring probe below.
        sn = repr(norm)[:800] if norm is not None else ""
        sd = repr(dec)[:800] if dec is not None else ""
        has_prepend = "Prepend" in sn
        has_strip = "Strip" in sd
        if has_prepend and has_strip:
            target.decoder = _dec.Sequence(
                [
                    _dec.Replace("▁", " "),
                    _dec.ByteFallback(),
                    _dec.Fuse(),
                ]
            )
    # ---------------------------------------------------------------------------- #
    # Tokenize the prompts
    # ---------------------------------------------------------------------------- #
    # prompt = "山东最高的山是?"
    if isinstance(prompts, str):
        prompts = [prompts]
    input_contents = [
        tokenizer.apply_chat_template(
            conversation=[{"role": "user", "content": prompt}],
            add_generation_prompt=True,
            tokenize=False,
        )
        for prompt in prompts
    ]
    print(input_contents[0], end="", flush=True)
    input_ids_list = tokenizer.batch_encode_plus(input_contents)[
        "input_ids"
    ]  # List: [[1, 1128, 526, 366, 29892]]
    # ---------------------------------------------------------------------------- #
    # Autoregressive generation
    # ---------------------------------------------------------------------------- #
    input_ids_infini = infinicore.from_list(input_ids_list)
    t1 = time.time()
    print("=================== start generate ====================")
    model.generate(
        input_ids_infini,
        max_new_tokens=max_new_tokens,
        device=infini_device,
        tokenizer=tokenizer,
    )
    t2 = time.time()
    print(
        f"total_time: {round((t2 - t1) * 1000, 2)} ms",
    )
if __name__ == "__main__":
    args = get_args()
    print(args)

    # Resolve the first device flag that was set (same precedence as before:
    # cpu > nvidia > metax > moore > iluvatar) to its runtime device string.
    flag_to_device = (
        ("cpu", "cpu"),
        ("nvidia", "cuda"),
        ("metax", "cuda"),
        ("moore", "musa"),
        ("iluvatar", "cuda"),
    )
    device_str = None
    for flag, dev in flag_to_device:
        if getattr(args, flag):
            device_str = dev
            break
    if device_str is None:
        print(
            "Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>\n"
            "such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
        )
        sys.exit(1)

    prompts = [args.prompt] * args.batch_size
    model_path = args.model_path
    max_new_tokens = args.max_new_tokens
    backend = args.backend
    infini_device = infinicore.device(device_str, 0)

    # Map the --dtype string onto the infinicore dtype object.
    dtype_by_name = {
        "float32": infinicore.float32,
        "bfloat16": infinicore.bfloat16,
        "float16": infinicore.float16,
    }
    if args.dtype not in dtype_by_name:
        raise ValueError(f"Unsupported dtype: {args.dtype}")
    infini_dtype = dtype_by_name[args.dtype]

    test(
        prompts,
        model_path,
        max_new_tokens,
        infini_device=infini_device,
        infini_dtype=infini_dtype,
        backend=backend,
    )
...@@ -3,7 +3,6 @@ from transformers import AutoTokenizer ...@@ -3,7 +3,6 @@ from transformers import AutoTokenizer
from tokenizers import decoders as _dec from tokenizers import decoders as _dec
from infinilm.modeling_utils import get_model_state_dict from infinilm.modeling_utils import get_model_state_dict
import infinilm import infinilm
from infinilm.distributed import DistConfig
import argparse import argparse
import sys import sys
import time import time
...@@ -76,12 +75,6 @@ def get_args(): ...@@ -76,12 +75,6 @@ def get_args():
default="How are you", default="How are you",
help="input prompt", help="input prompt",
) )
parser.add_argument(
"--tp",
type=int,
default=None,
help="total rank for tensor parallel",
)
return parser.parse_args() return parser.parse_args()
...@@ -103,7 +96,6 @@ def test( ...@@ -103,7 +96,6 @@ def test(
device=infini_device, device=infini_device,
dtype=infini_dtype, dtype=infini_dtype,
backend=backend, backend=backend,
distributed_config=DistConfig(args.tp),
) )
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
...@@ -115,7 +107,7 @@ def test( ...@@ -115,7 +107,7 @@ def test(
dtype=infini_dtype, dtype=infini_dtype,
) )
model.load_state_dict(model_param_infini) model.load_state_dict(model_param_infini, strict=True)
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
# 创建 tokenizer # 创建 tokenizer
......
import os import os
from typing import Dict, Optional, Union from typing import Dict, Union
import torch import torch
from safetensors import safe_open from safetensors import safe_open
...@@ -23,15 +23,39 @@ str_to_torch_dtype = { ...@@ -23,15 +23,39 @@ str_to_torch_dtype = {
} }
def check_parameters(model_keys: list, already_loaded_keys: list):
    """Compare the model's expected parameter names with the loaded ones.

    Returns a list of error strings describing mismatches: keys that were
    loaded but the model does not declare ("unexpected"), and keys the model
    declares but were never loaded ("missing"). The missing-keys message, if
    any, comes first. An empty list means the key sets match exactly.
    """
    expected = set(model_keys)
    loaded = set(already_loaded_keys)

    errors: list[str] = []

    unexpected_keys = loaded - expected
    if unexpected_keys:
        errors.insert(
            0,
            "Unexpected key(s) in state_dict: {}. ".format(
                ", ".join('"{}"'.format(k) for k in unexpected_keys)
            ),
        )

    missing_keys = expected - loaded
    if missing_keys:
        errors.insert(
            0,
            "Missing key(s) in state_dict: {}. ".format(
                ", ".join('"{}"'.format(k) for k in missing_keys)
            ),
        )

    return errors
def load_state_dict( def load_state_dict(
checkpoint_file: Union[str, os.PathLike], checkpoint_file: Union[str, os.PathLike], device="cpu", dtype=torch.bfloat16
map_location: Optional[Union[str, torch.device]] = "cpu",
weights_only: bool = True,
) -> Dict[str, torch.Tensor]: ) -> Dict[str, torch.Tensor]:
""" """
Reads a `safetensor` checkpoint file. We load the checkpoint on "cpu" by default. Reads a `safetensor` checkpoint file. We load the checkpoint on "cpu" by default.
""" """
# Use safetensors if possible
if not checkpoint_file.endswith(".safetensors"): if not checkpoint_file.endswith(".safetensors"):
return {} return {}
...@@ -49,20 +73,7 @@ def load_state_dict( ...@@ -49,20 +73,7 @@ def load_state_dict(
) )
for k in f.keys(): for k in f.keys():
if map_location == "meta": state_dict[k] = f.get_tensor(k).to(device=device, dtype=dtype)
_slice = f.get_slice(k)
k_dtype = _slice.get_dtype()
if k_dtype in str_to_torch_dtype:
dtype = str_to_torch_dtype[k_dtype]
else:
raise ValueError(
f"Cannot load safetensors of unknown dtype {k_dtype}"
)
state_dict[k] = torch.empty(
size=_slice.get_shape(), dtype=dtype, device="meta"
)
else:
state_dict[k] = f.get_tensor(k)
return state_dict return state_dict
...@@ -75,30 +86,93 @@ def get_model_state_dict( ...@@ -75,30 +86,93 @@ def get_model_state_dict(
""" """
Load the model weights. Load the model weights.
""" """
torch_device = device.type
torch_dtype = infinicore.utils.to_torch_dtype(dtype)
# --------------------------------------------------------- # # --------------------------------------------------------- #
# 使用从 *.safetensors文件中加载权重 # Load weights from all *.safetensors files
# --------------------------------------------------------- # # --------------------------------------------------------- #
model_param = {} model_param = {}
for file_path in glob.glob(os.path.join(model_path, "*.safetensors")): for file_path in glob.glob(os.path.join(model_path, "*.safetensors")):
model_param.update(load_state_dict(file_path)) model_param.update(
load_state_dict(file_path, device=torch_device, dtype=torch_dtype)
)
if model_param.get("lm_head.weight", None) is None: if model_param.get("lm_head.weight", None) is None:
model_param["lm_head.weight"] = model_param["model.embed_tokens.weight"] model_param["lm_head.weight"] = model_param["model.embed_tokens.weight"]
# --------------------------------------------------------- # # --------------------------------------------------------- #
# 调整权重的device和dtype # model_param_infini references torch.Tensor
# --------------------------------------------------------- # # --------------------------------------------------------- #
torch_device = device.type
torch_dtype = infinicore.utils.to_torch_dtype(dtype)
model_param_infini = {} model_param_infini = {}
for key, value in model_param.items(): for key in model_param.keys():
model_param[key] = value.to(device=torch_device, dtype=torch_dtype)
# --------------------------------------------------------- #
# model_param_infini 引用torch.Tensor
# --------------------------------------------------------- #
for key, value in model_param.items():
model_param_infini[key] = infinicore.from_torch(model_param[key]) model_param_infini[key] = infinicore.from_torch(model_param[key])
return model_param_infini return model_param_infini
def load_model_state_dict_by_file(
    model: infinicore.nn.Module,
    model_path: str,
    dtype=infinicore.dtype,
) -> None:
    """
    Load the model weights file-by-file.

    Iterates over every *.safetensors file under `model_path`; for each file,
    converts its tensors to infinicore tensors and loads them into `model`
    with strict=False (each file only covers part of the state dict). After
    all files are consumed, the union of loaded keys is validated against
    model.state_dict() and a RuntimeError is raised on any mismatch.

    NOTE(review): the default `dtype=infinicore.dtype` is a type object, not
    a dtype value — callers appear expected to always pass dtype explicitly;
    confirm intended default.
    """
    torch_device = "cpu"
    torch_dtype = infinicore.utils.to_torch_dtype(dtype)

    model_keys = model.state_dict().keys()
    already_loaded_keys = []
    for file_path in glob.glob(os.path.join(model_path, "*.safetensors")):
        # --------------------------------------------------------- #
        # Load weights from *.safetensors file
        # --------------------------------------------------------- #
        model_param = load_state_dict(file_path, device=torch_device, dtype=torch_dtype)
        already_loaded_keys.extend(model_param.keys())
        # --------------------------------------------------------- #
        # model_param_infini references torch.Tensor
        # --------------------------------------------------------- #
        model_param_infini = {}
        for key in model_param.keys():
            model_param_infini[key] = infinicore.from_torch(model_param[key])
        # strict=False: this file holds only a subset of the parameters.
        model.load_state_dict(model_param_infini, strict=False)
        # presumably flushes pending device copies before this file's torch
        # tensors go out of scope — TODO confirm sync_device() semantics
        infinicore.sync_device()

    error_msgs = check_parameters(model_keys, already_loaded_keys)
    if len(error_msgs) > 0:
        raise RuntimeError(
            "Error(s) in loading state_dict\n\t{}".format("\n\t".join(error_msgs))
        )
def load_model_state_dict_by_tensor(
    model: infinicore.nn.Module,
    model_path: str,
    dtype=infinicore.dtype,
) -> None:
    """
    Load the model weights tensor-by-tensor.

    Opens each *.safetensors file under `model_path` and pushes every tensor
    individually into the model via model.load_parameter(name, tensor),
    keeping at most one weight tensor in host memory at a time. Afterwards
    the set of loaded names is validated against model.state_dict() and a
    RuntimeError is raised on any missing/unexpected keys.

    NOTE(review): the default `dtype=infinicore.dtype` is a type object, not
    a dtype value — callers appear expected to always pass dtype explicitly;
    confirm intended default.
    """
    torch_dtype = infinicore.utils.to_torch_dtype(dtype)

    model_keys = model.state_dict().keys()
    already_loaded_keys = []
    for file in glob.glob(os.path.join(model_path, "*.safetensors")):
        with safe_open(file, "pt", "cpu") as f:
            for name in f.keys():
                param_infini = infinicore.from_torch(
                    f.get_tensor(name).to(dtype=torch_dtype)
                )
                model.load_parameter(name, param_infini)
                already_loaded_keys.append(name)
        # presumably waits for in-flight copies of this file's tensors — note
        # the by-file loader calls sync_device() instead; confirm which is right
        infinicore.sync_stream()

    error_msgs = check_parameters(model_keys, already_loaded_keys)
    if len(error_msgs) > 0:
        raise RuntimeError(
            "Error(s) in loading state_dict\n\t{}".format("\n\t".join(error_msgs))
        )
...@@ -18,6 +18,10 @@ class AutoLlamaModel: ...@@ -18,6 +18,10 @@ class AutoLlamaModel:
if backend == "python": if backend == "python":
from . import modeling_llama from . import modeling_llama
print("\n***************************************************************")
print("\t\t Loading Llama Model with Python Backend")
print(f"\t\t Device: {device}, DType: {dtype}")
print("***************************************************************\n")
return modeling_llama.LlamaForCausalLM.from_pretrained( return modeling_llama.LlamaForCausalLM.from_pretrained(
model_path, model_path,
device=device, device=device,
...@@ -28,6 +32,10 @@ class AutoLlamaModel: ...@@ -28,6 +32,10 @@ class AutoLlamaModel:
elif backend == "cpp": elif backend == "cpp":
from .backends import cpp from .backends import cpp
print("\n***************************************************************")
print("\t\tLoading Llama Model with C++ Backend")
print(f"\t\tDevice: {device}, DType: {dtype}")
print("***************************************************************\n")
return cpp.LlamaForCausalLM.from_pretrained( return cpp.LlamaForCausalLM.from_pretrained(
model_path, model_path,
device=device, device=device,
......
...@@ -6,6 +6,7 @@ from infinilm.distributed import DistConfig ...@@ -6,6 +6,7 @@ from infinilm.distributed import DistConfig
import json import json
import os import os
from typing import Optional, Union from typing import Optional, Union
from collections import OrderedDict
class LlamaConfig: class LlamaConfig:
...@@ -120,9 +121,12 @@ class LlamaForCausalLM(GenerationMixin): ...@@ -120,9 +121,12 @@ class LlamaForCausalLM(GenerationMixin):
def state_dict(self): def state_dict(self):
"""Get model state dictionary with parameter shapes""" """Get model state dictionary with parameter shapes"""
return self._model.state_dict() destination = OrderedDict()
for name, param in self._model.state_dict().items():
destination[name] = infinicore.Tensor(param)
return destination
def load_state_dict(self, state_dict): def load_state_dict(self, state_dict, strict=None):
""" """
Load state dictionary into the model Load state dictionary into the model
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment