Unverified Commit 39bea30a authored by pengcheng888's avatar pengcheng888 Committed by GitHub

Merge pull request #105 from pengcheng888/issue/102

issue/102 - Add functions for loading weight files per-file and per-tensor
parents 8c224092 7128a9a5
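The new loaders are exercised by the example script further down this diff; a minimal usage sketch, assuming a local safetensors checkpoint (the path below is illustrative only):

import os
import infinicore
import infinilm
from infinilm.modeling_utils import (
    load_model_state_dict_by_file,
    load_model_state_dict_by_tensor,
)

model_path = os.path.expanduser("~/TinyLlama-1.1B-Chat-v1.0")  # hypothetical path
model = infinilm.AutoLlamaModel.from_pretrained(
    model_path,
    device=infinicore.device("cpu", 0),
    dtype=infinicore.bfloat16,
)
# Load shard by shard (one safetensors file resident at a time) ...
load_model_state_dict_by_file(model, model_path, dtype=infinicore.bfloat16)
# ... or tensor by tensor (at most one torch copy alive at a time):
# load_model_state_dict_by_tensor(model, model_path, dtype=infinicore.bfloat16)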
......@@ -30,6 +30,16 @@ void InferEngine::load_param(const std::string &name, const infinicore::Tensor &
}
}
//------------------------------------------------------
// state_dict
//------------------------------------------------------
std::unordered_map<std::string, infinicore::nn::Parameter> InferEngine::state_dict() {
if (workers_.empty()) {
    throw std::runtime_error("Model object not found.");
}
return workers_[0]->state_dict();
}
//------------------------------------------------------
// generate
//------------------------------------------------------
......
......@@ -19,6 +19,9 @@ public:
// Load a parameter to all workers (each can extract its shard inside RankWorker)
void load_param(const std::string &name, const infinicore::Tensor &param);
// Return the parameters (i.e., weights and biases).
std::unordered_map<std::string, infinicore::nn::Parameter> state_dict();
// Run a single forward pass on all workers and return the outputs from all ranks
infinicore::Tensor generate(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids);
......
......@@ -74,6 +74,13 @@ void RankWorker::load_param(const std::string &name,
}
}
//------------------------------------------------------
// state_dict
//------------------------------------------------------
std::unordered_map<std::string, infinicore::nn::Parameter> RankWorker::state_dict() {
return this->model_->state_dict();
}
//------------------------------------------------------
// run -- asynchronous
//------------------------------------------------------
......
......@@ -28,6 +28,9 @@ public:
void load_param(const std::string &name,
const infinicore::Tensor &param);
// Return the parameters (i.e., weights and biases).
std::unordered_map<std::string, infinicore::nn::Parameter> state_dict();
// Submit a run (forward) job.
void run(const std::vector<std::any> &args);
......
......@@ -42,11 +42,16 @@ inline void bind_infer_engine(py::module &m) {
.def("load_param", &InferEngine::load_param,
py::arg("name"), py::arg("param"),
"Load a parameter tensor into all workers (each worker picks its shard)")
.def(
"generate", [](InferEngine &self, py::object input_ids, py::object position_ids) -> infinicore::Tensor {
return self.generate(input_ids.cast<infinicore::Tensor>(), position_ids.cast<infinicore::Tensor>());
},
"Run inference on all ranks with arbitrary arguments");
.def("state_dict", [](InferEngine &self) {
// Return a dictionary containing references to the whole state of the module.
auto state_dict = self.state_dict();
py::dict result;
for (const auto &[name, param] : state_dict) {
result[py::cast(name)] = infinicore::Tensor(param);
}
return result;
})
.def("generate", [](InferEngine &self, py::object input_ids, py::object position_ids) -> infinicore::Tensor { return self.generate(input_ids.cast<infinicore::Tensor>(), position_ids.cast<infinicore::Tensor>()); }, "Run inference on all ranks with arbitrary arguments");
// Optionally, you can add __repr__ for debugging
m.attr("InferEngine").attr("__repr__") = py::cpp_function([](const InferEngine &self) {
......
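On the Python side, the freshly bound state_dict() comes back as a plain dict of infinicore tensors gathered from rank 0, with load_param() as the inverse direction. A minimal hedged sketch (it assumes an already constructed engine, and that the Python Tensor wrapper exposes .shape like its C++ counterpart):

def dump_engine_params(engine) -> None:
    # engine.state_dict() -> dict[str, infinicore.Tensor], taken from worker 0
    for name, tensor in engine.state_dict().items():
        print(name, tensor.shape)
    # The inverse direction is engine.load_param(name, tensor), which fans a
    # tensor out to every worker; each worker extracts its own shard.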
......@@ -172,20 +172,15 @@ inline void bind_llama(py::module &m) {
}),
py::arg("config"), py::arg("device"), py::arg("dtype") = py::none())
.def("state_dict", [](const LlamaForCausalLM &model) {
// Convert state_dict to Python dict with shape information
// Return a dictionary containing references to the whole state of the module.
auto state_dict = model.state_dict();
py::dict result;
for (const auto &[name, param] : state_dict) {
// Parameter is a shared_ptr<Tensor>, get shape from it
py::dict param_info;
param_info["shape"] = py::cast(param->shape());
param_info["dtype"] = py::cast(static_cast<int>(param->dtype()));
result[py::cast(name)] = param_info;
result[py::cast(name)] = infinicore::Tensor(param);
}
return result;
})
.def(
"get_parameter", [](const LlamaForCausalLM &model, const std::string &name) {
.def("get_parameter", [](const LlamaForCausalLM &model, const std::string &name) {
// Get actual tensor parameter by name
auto state_dict = model.state_dict();
auto it = state_dict.find(name);
......@@ -194,11 +189,8 @@ inline void bind_llama(py::module &m) {
const infinicore::Tensor &tensor = it->second;
return tensor;
}
throw std::runtime_error("Parameter '" + name + "' not found in model");
},
py::arg("name"))
.def(
"load_state_dict", [convert_to_tensor](LlamaForCausalLM &model, py::dict state_dict, const Device &device) {
throw std::runtime_error("Parameter '" + name + "' not found in model"); }, py::arg("name"))
.def("load_state_dict", [convert_to_tensor](LlamaForCausalLM &model, py::dict state_dict, const Device &device) {
// Convert Python dict to C++ state_dict
std::unordered_map<std::string, infinicore::Tensor> cpp_state_dict;
for (auto item : state_dict) {
......@@ -206,12 +198,9 @@ inline void bind_llama(py::module &m) {
py::object value = item.second.cast<py::object>();
cpp_state_dict.emplace(key, convert_to_tensor(value, device));
}
model.load_state_dict(cpp_state_dict);
},
py::arg("state_dict"), py::arg("device"))
model.load_state_dict(cpp_state_dict); }, py::arg("state_dict"), py::arg("device"))
.def("config", &LlamaForCausalLM::config, py::return_value_policy::reference_internal)
.def(
"forward", [convert_to_tensor](const LlamaForCausalLM &model, py::object input_ids, py::object position_ids, py::object kv_caches = py::none()) {
.def("forward", [convert_to_tensor](const LlamaForCausalLM &model, py::object input_ids, py::object position_ids, py::object kv_caches = py::none()) {
// Helper to extract C++ tensor from Python object
auto get_tensor = [convert_to_tensor](py::object obj) -> infinicore::Tensor {
// If it's already a Python InfiniCore tensor wrapper, extract the underlying tensor
......@@ -247,8 +236,8 @@ inline void bind_llama(py::module &m) {
// kv_caches is accepted but not forwarded yet; always pass nullptr for now
std::vector<void *> *kv_caches_ptr = nullptr;
return model.forward(infini_input_ids, infini_position_ids, kv_caches_ptr);
},
return model.forward(infini_input_ids, infini_position_ids, kv_caches_ptr); },
//
py::arg("input_ids"), py::arg("position_ids"), py::arg("kv_caches") = py::none());
}
......
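The net effect of this hunk on the Python side: state_dict() now yields the tensors themselves rather than shape/dtype metadata, so a state dict can be round-tripped through load_state_dict(). A hedged sketch, assuming the checkpoint actually contains "lm_head.weight":

device = infinicore.device("cpu", 0)
sd = model.state_dict()                    # dict[str, infinicore.Tensor]
w = model.get_parameter("lm_head.weight")  # raises RuntimeError if absent
model.load_state_dict(sd, device)          # feeds the same tensors back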
import infinicore
from transformers import AutoTokenizer
from tokenizers import decoders as _dec
from infinilm.modeling_utils import load_model_state_dict_by_file
import infinilm
from infinilm.distributed import DistConfig
import argparse
import sys
import time
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))
def get_args():
parser = argparse.ArgumentParser(description="Run the Llama example")
parser.add_argument(
"--cpu",
action="store_true",
help="Run cpu test",
)
parser.add_argument(
"--nvidia",
action="store_true",
help="Run nvidia test",
)
parser.add_argument(
"--metax",
action="store_true",
help="Run metax test",
)
parser.add_argument(
"--moore",
action="store_true",
help="Run moore test",
)
parser.add_argument(
"--iluvatar",
action="store_true",
help="Run iluvatar test",
)
parser.add_argument(
"--model_path",
type=str,
required=True,
help="model_path",
)
parser.add_argument(
"--max_new_tokens",
type=int,
default=100,
help="max_new_tokens",
)
parser.add_argument(
"--backend",
type=str,
default="python",
help="python or cpp model",
)
parser.add_argument(
"--dtype",
type=str,
default="bfloat16",
help="float32, float16, bfloat16",
)
parser.add_argument(
"--batch_size",
type=int,
default=1,
help="number of prompts in a batch",
)
parser.add_argument(
"--prompt",
type=str,
default="How are you",
help="input prompt",
)
parser.add_argument(
"--tp",
type=int,
default=None,
help="total rank for tensor parallel",
)
return parser.parse_args()
def test(
prompts: str | list[str],
model_path,
max_new_tokens=100,
infini_dtype=infinicore.bfloat16,
infini_device=infinicore.device("cpu", 0),
backend="python",
):
model_path = os.path.expanduser(model_path)
# ---------------------------------------------------------------------------- #
# Create the model
# ---------------------------------------------------------------------------- #
model = infinilm.AutoLlamaModel.from_pretrained(
model_path,
device=infini_device,
dtype=infini_dtype,
backend=backend,
distributed_config=DistConfig(args.tp),
)
# ---------------------------------------------------------------------------- #
# Load the weights
# ---------------------------------------------------------------------------- #
load_model_state_dict_by_file(model, model_path, dtype=infini_dtype)
# ---------------------------------------------------------------------------- #
# Create the tokenizer
# ---------------------------------------------------------------------------- #
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if "llama" == model.config.model_type:
backend = getattr(tokenizer, "backend_tokenizer", None)
target = getattr(backend, "_tokenizer", backend)
norm = getattr(target, "normalizer", None)
dec = getattr(target, "decoder", None)
sn = repr(norm)[:800] if norm is not None else ""
sd = repr(dec)[:800] if dec is not None else ""
has_prepend = "Prepend" in sn
has_strip = "Strip" in sd
if has_prepend and has_strip:
target.decoder = _dec.Sequence(
[
_dec.Replace("▁", " "),
_dec.ByteFallback(),
_dec.Fuse(),
]
)
# ---------------------------------------------------------------------------- #
# Token encoding
# ---------------------------------------------------------------------------- #
# prompt = "What is the highest mountain in Shandong?"
if isinstance(prompts, str):
prompts = [prompts]
input_contents = [
tokenizer.apply_chat_template(
conversation=[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
for prompt in prompts
]
print(input_contents[0], end="", flush=True)
input_ids_list = tokenizer.batch_encode_plus(input_contents)[
"input_ids"
] # List: [[1, 1128, 526, 366, 29892]]
# ---------------------------------------------------------------------------- #
# Autoregressive generation
# ---------------------------------------------------------------------------- #
input_ids_infini = infinicore.from_list(input_ids_list)
t1 = time.time()
print("=================== start generate ====================")
model.generate(
input_ids_infini,
max_new_tokens=max_new_tokens,
device=infini_device,
tokenizer=tokenizer,
)
t2 = time.time()
print(
f"total_time: {round((t2 - t1) * 1000, 2)} ms",
)
if __name__ == "__main__":
args = get_args()
print(args)
# Select the device from the command-line flags
device_str = "cpu"
if args.cpu:
device_str = "cpu"
elif args.nvidia:
device_str = "cuda"
elif args.metax:
device_str = "cuda"
elif args.moore:
device_str = "musa"
elif args.iluvatar:
device_str = "cuda"
else:
print(
"Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>\n"
"such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
)
sys.exit(1)
prompts = [args.prompt for _ in range(args.batch_size)]
model_path = args.model_path
max_new_tokens = args.max_new_tokens
backend = args.backend
infini_device = infinicore.device(device_str, 0)
if args.dtype == "float32":
infini_dtype = infinicore.float32
elif args.dtype == "bfloat16":
infini_dtype = infinicore.bfloat16
elif args.dtype == "float16":
infini_dtype = infinicore.float16
else:
raise ValueError(f"Unsupported dtype: {args.dtype}")
test(
prompts,
model_path,
max_new_tokens,
infini_device=infini_device,
infini_dtype=infini_dtype,
backend=backend,
)
......@@ -3,7 +3,6 @@ from transformers import AutoTokenizer
from tokenizers import decoders as _dec
from infinilm.modeling_utils import get_model_state_dict
import infinilm
from infinilm.distributed import DistConfig
import argparse
import sys
import time
......@@ -76,12 +75,6 @@ def get_args():
default="How are you",
help="input prompt",
)
parser.add_argument(
"--tp",
type=int,
default=None,
help="total rank for tensor parallel",
)
return parser.parse_args()
......@@ -103,7 +96,6 @@ def test(
device=infini_device,
dtype=infini_dtype,
backend=backend,
distributed_config=DistConfig(args.tp),
)
# ---------------------------------------------------------------------------- #
......@@ -115,7 +107,7 @@ def test(
dtype=infini_dtype,
)
model.load_state_dict(model_param_infini)
model.load_state_dict(model_param_infini, strict=True)
# ---------------------------------------------------------------------------- #
# 创建 tokenizer
......
import os
from typing import Dict, Optional, Union
from typing import Dict, Union
import torch
from safetensors import safe_open
......@@ -23,15 +23,39 @@ str_to_torch_dtype = {
}
def check_parameters(model_keys: list, already_loaded_keys: list):
model_keys = set(model_keys)
already_loaded_keys = set(already_loaded_keys)
intersection = model_keys & already_loaded_keys
missing_keys = model_keys - intersection
unexpected_keys = already_loaded_keys - intersection
error_msgs: list[str] = []
if len(unexpected_keys) > 0:
error_msgs.insert(
0,
"Unexpected key(s) in state_dict: {}. ".format(
", ".join('"{}"'.format(k) for k in unexpected_keys)
),
)
if len(missing_keys) > 0:
error_msgs.insert(
0,
"Missing key(s) in state_dict: {}. ".format(
", ".join('"{}"'.format(k) for k in missing_keys)
),
)
return error_msgs
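# A quick sanity check of check_parameters (hedged example; the keys below are
# illustrative only):
#
#     msgs = check_parameters(
#         model_keys=["model.embed_tokens.weight", "lm_head.weight"],
#         already_loaded_keys=["model.embed_tokens.weight", "model.norm.weight"],
#     )
#     # msgs[0] reports the missing "lm_head.weight",
#     # msgs[1] the unexpected "model.norm.weight"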
def load_state_dict(
checkpoint_file: Union[str, os.PathLike],
map_location: Optional[Union[str, torch.device]] = "cpu",
weights_only: bool = True,
checkpoint_file: Union[str, os.PathLike], device="cpu", dtype=torch.bfloat16
) -> Dict[str, torch.Tensor]:
"""
Reads a `safetensor` checkpoint file. We load the checkpoint on "cpu" by default.
"""
# Use safetensors if possible
if not checkpoint_file.endswith(".safetensors"):
return {}
......@@ -49,20 +73,7 @@ def load_state_dict(
)
for k in f.keys():
if map_location == "meta":
_slice = f.get_slice(k)
k_dtype = _slice.get_dtype()
if k_dtype in str_to_torch_dtype:
dtype = str_to_torch_dtype[k_dtype]
else:
raise ValueError(
f"Cannot load safetensors of unknown dtype {k_dtype}"
)
state_dict[k] = torch.empty(
size=_slice.get_shape(), dtype=dtype, device="meta"
)
else:
state_dict[k] = f.get_tensor(k)
state_dict[k] = f.get_tensor(k).to(device=device, dtype=dtype)
return state_dict
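# Example (hedged; the shard filename is hypothetical): load one shard onto
# the CPU as bfloat16.
#
#     shard = load_state_dict(
#         "model-00001-of-00002.safetensors", device="cpu", dtype=torch.bfloat16
#     )
#     # 'shard' maps tensor names to torch.Tensors already moved to the
#     # requested device and dtype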
......@@ -75,30 +86,93 @@ def get_model_state_dict(
"""
Load the model weights.
"""
torch_device = device.type
torch_dtype = infinicore.utils.to_torch_dtype(dtype)
# --------------------------------------------------------- #
# Load weights from all *.safetensors files
# --------------------------------------------------------- #
model_param = {}
for file_path in glob.glob(os.path.join(model_path, "*.safetensors")):
model_param.update(load_state_dict(file_path))
model_param.update(
load_state_dict(file_path, device=torch_device, dtype=torch_dtype)
)
if model_param.get("lm_head.weight", None) is None:
model_param["lm_head.weight"] = model_param["model.embed_tokens.weight"]
# --------------------------------------------------------- #
# Adjust the device and dtype of the weights
# model_param_infini references torch.Tensor
# --------------------------------------------------------- #
torch_device = device.type
model_param_infini = {}
for key in model_param.keys():
model_param_infini[key] = infinicore.from_torch(model_param[key])
return model_param_infini
def load_model_state_dict_by_file(
    model: infinicore.nn.Module,
    model_path: str,
    dtype=infinicore.bfloat16,
) -> None:
    """
    Load the model weights file by file.
    """
    torch_device = "cpu"
    torch_dtype = infinicore.utils.to_torch_dtype(dtype)
    model_keys = model.state_dict().keys()
    already_loaded_keys = []
    for file_path in glob.glob(os.path.join(model_path, "*.safetensors")):
        # --------------------------------------------------------- #
        # Load weights from one *.safetensors file
        # --------------------------------------------------------- #
        model_param = load_state_dict(file_path, device=torch_device, dtype=torch_dtype)
        already_loaded_keys.extend(model_param.keys())
        # --------------------------------------------------------- #
        # model_param_infini references the underlying torch.Tensor
        # --------------------------------------------------------- #
        model_param_infini = {}
        for key in model_param.keys():
            model_param_infini[key] = infinicore.from_torch(model_param[key])
        model.load_state_dict(model_param_infini, strict=False)
    infinicore.sync_device()
    error_msgs = check_parameters(model_keys, already_loaded_keys)
    if len(error_msgs) > 0:
        raise RuntimeError(
            "Error(s) in loading state_dict\n\t{}".format("\n\t".join(error_msgs))
        )
def load_model_state_dict_by_tensor(
model: infinicore.nn.Module,
model_path: str,
dtype=infinicore.bfloat16,
):
"""
Load the model weights tensor by tensor.
"""
torch_dtype = infinicore.utils.to_torch_dtype(dtype)
model_keys = model.state_dict().keys()
already_loaded_keys = []
for file in glob.glob(os.path.join(model_path, "*.safetensors")):
with safe_open(file, "pt", "cpu") as f:
for name in f.keys():
param_infini = infinicore.from_torch(
f.get_tensor(name).to(dtype=torch_dtype)
)
model.load_parameter(name, param_infini)
already_loaded_keys.append(name)
infinicore.sync_stream()
error_msgs = check_parameters(model_keys, already_loaded_keys)
if len(error_msgs) > 0:
raise RuntimeError(
"Error(s) in loading state_dict\n\t{}".format("\n\t".join(error_msgs))
)
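Design note: load_model_state_dict_by_file materializes one whole safetensors shard in host memory and applies it with load_state_dict(strict=False), while load_model_state_dict_by_tensor streams parameters one at a time through model.load_parameter, keeping peak host memory near a single tensor; both finish by validating key coverage with check_parameters.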
......@@ -18,6 +18,10 @@ class AutoLlamaModel:
if backend == "python":
from . import modeling_llama
print("\n***************************************************************")
print("\t\t Loading Llama Model with Python Backend")
print(f"\t\t Device: {device}, DType: {dtype}")
print("***************************************************************\n")
return modeling_llama.LlamaForCausalLM.from_pretrained(
model_path,
device=device,
......@@ -28,6 +32,10 @@ class AutoLlamaModel:
elif backend == "cpp":
from .backends import cpp
print("\n***************************************************************")
print("\t\tLoading Llama Model with C++ Backend")
print(f"\t\tDevice: {device}, DType: {dtype}")
print("***************************************************************\n")
return cpp.LlamaForCausalLM.from_pretrained(
model_path,
device=device,
......
......@@ -6,6 +6,7 @@ from infinilm.distributed import DistConfig
import json
import os
from typing import Optional, Union
from collections import OrderedDict
class LlamaConfig:
......@@ -120,9 +121,12 @@ class LlamaForCausalLM(GenerationMixin):
def state_dict(self):
"""Get model state dictionary with parameter shapes"""
return self._model.state_dict()
destination = OrderedDict()
for name, param in self._model.state_dict().items():
destination[name] = infinicore.Tensor(param)
return destination
def load_state_dict(self, state_dict):
def load_state_dict(self, state_dict, strict=None):
"""
Load state dictionary into the model
......