Commit 1b5d1ea7 authored by pengcheng888's avatar pengcheng888
Browse files

issue/115 完善bench.py文件

parent 6498332e
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include <cstddef> #include <cstddef>
#include <string> #include <string>
#include <cstdint>
namespace infinilm::cache { namespace infinilm::cache {
......
import infinicore import infinicore
from transformers import AutoTokenizer from transformers import AutoTokenizer
from tokenizers import decoders as _dec
from infinilm.modeling_utils import load_model_state_dict_by_file from infinilm.modeling_utils import load_model_state_dict_by_file
import infinilm import infinilm
from infinilm.distributed import DistConfig from infinilm.distributed import DistConfig
...@@ -8,10 +7,121 @@ import argparse ...@@ -8,10 +7,121 @@ import argparse
import sys import sys
import time import time
import os import os
import json
from collections import OrderedDict
from tqdm import tqdm
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))
# Bytes per element for each supported tensor dtype; used by get_test_cases()
# to estimate KV-cache memory for a benchmark case.
DATA_TYPE_BYTES = {
    "bfloat16": 2,
    "float16": 2,
    "float32": 4,
}
# Example sweep values kept for reference; pass the actual sweep via the
# --batch-size / --input-len / --output-len CLI arguments instead.
# BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128]
# INPUT_LENS = [32, 256, 1024, 4096]
# OUTPUT_LENS = [256, 1024, 4096]
def read_json_file(file_path):
    """Load and return the JSON content of ``file_path``.

    The file is opened with an explicit UTF-8 encoding so the result does not
    depend on the platform's default locale encoding (JSON is defined over
    UTF-8).

    Parameters:
        file_path: path to a JSON file on disk.

    Returns:
        The deserialized JSON value (typically a dict for config files).
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
def parse_list(value: str):
    """Parse a CLI argument that is either a single int or a list of ints.

    Accepted forms:
        "1"       -> 1
        "[1,2,4]" -> [1, 2, 4]
        "1,2,4"   -> [1, 2, 4]

    Raises:
        argparse.ArgumentTypeError: if the value matches none of the forms.
    """
    value = value.strip()
    # JSON-style list, e.g. "[1,2,4]". A string that parses as JSON and is
    # bracket-delimited is always a JSON array, so no scalar fallback needed.
    if value.startswith("[") and value.endswith("]"):
        try:
            return [int(x) for x in json.loads(value)]
        except (json.JSONDecodeError, ValueError):
            pass
    # Comma-separated values, e.g. "1,2,4"
    if "," in value:
        try:
            return [int(x.strip()) for x in value.split(",")]
        except ValueError:
            pass
    # Plain integer
    try:
        return int(value)
    except ValueError:
        # Generic message: this parser backs --batch-size, --input-len and
        # --output-len alike, so it must not name one specific flag.
        raise argparse.ArgumentTypeError(
            f"expected an int or a list of ints (e.g. '1', '[1,2,4]', '1,2,4'), got: {value}"
        ) from None
def get_test_cases(
    model_path: str,
    batch_size_list: list[int],
    input_len_list: list[int],
    output_len_list: list[int],
):
    """Generate benchmark cases ordered by ascending KV-cache memory usage.

    Parameters:
        model_path: directory containing the model's ``config.json``.
        batch_size_list: batch sizes to sweep.
        input_len_list: prompt lengths (tokens) to sweep.
        output_len_list: generation lengths (tokens) to sweep.

    Returns:
        OrderedDict mapping a dense index to a case dict with keys
        ``idx``, ``batch_size``, ``input_len``, ``output_len``,
        ``data_type`` and ``kvcache_memory`` (GiB, rounded to 3 decimals),
        sorted by increasing KV-cache size.
    """
    model_path = os.path.expanduser(model_path)
    # Load model config to derive attention dimensions.
    config = read_json_file(os.path.join(model_path, "config.json"))
    # Derive the per-head dimension lazily: evaluating the fallback eagerly
    # (as dict.get's default argument) would crash on configs that provide
    # head_dim but omit hidden_size / num_attention_heads.
    if "head_dim" in config:
        head_dim = config["head_dim"]
    else:
        head_dim = config["hidden_size"] // config["num_attention_heads"]
    # KV heads and layer count drive the cache size.
    num_key_value_heads = config.get("num_key_value_heads")
    num_hidden_layers = config.get("num_hidden_layers")
    # Enumerate all batch/input/output combinations and size the KV cache.
    case_list = []
    for batch_size in batch_size_list:
        for input_len in input_len_list:
            for output_len in output_len_list:
                for data_type in ["bfloat16"]:
                    data_type_bytes = DATA_TYPE_BYTES[data_type]
                    total_seq_len = input_len + output_len
                    kvcache_memory_bytes = (
                        data_type_bytes
                        * (batch_size * total_seq_len * num_key_value_heads * head_dim)
                        * num_hidden_layers
                    )
                    kvcache_memory_gb = kvcache_memory_bytes / (1024 * 1024 * 1024)
                    case_list.append(
                        {
                            "idx": len(case_list),
                            "batch_size": batch_size,
                            "input_len": input_len,
                            "output_len": output_len,
                            "data_type": data_type,
                            "kvcache_memory": round(kvcache_memory_gb, 3),
                        }
                    )
    # Sort by KV cache size and re-key by dense index so iteration order is
    # cheapest-first.
    return OrderedDict(
        (idx, case)
        for idx, case in enumerate(
            sorted(case_list, key=lambda case: case["kvcache_memory"])
        )
    )
def get_args(): def get_args():
parser = argparse.ArgumentParser(description="run Llama args") parser = argparse.ArgumentParser(description="run Llama args")
...@@ -41,9 +151,9 @@ def get_args(): ...@@ -41,9 +151,9 @@ def get_args():
parser.add_argument( parser.add_argument(
"--batch-size", "--batch-size",
type=int, type=parse_list,
default=1, default=1,
help="number of prompts in a batch", help="number of prompts in a batch (can be an int or a list of ints, e.g., '1' or '[1,2,4]' or '1,2,4')",
) )
parser.add_argument( parser.add_argument(
"--tensor-parallel-size", "--tensor-parallel-size",
...@@ -54,15 +164,15 @@ def get_args(): ...@@ -54,15 +164,15 @@ def get_args():
) )
parser.add_argument( parser.add_argument(
"--input-len", "--input-len",
type=int, type=parse_list,
default=1, default=10,
help="output tokens", help="output tokens",
) )
parser.add_argument( parser.add_argument(
"--output-len", "--output-len",
type=int, type=parse_list,
default=10, default=20,
help="output tokens", help="output tokens",
) )
return parser.parse_args() return parser.parse_args()
...@@ -77,20 +187,22 @@ def repeat_prompt(input_ids: list[int], target_length: int): ...@@ -77,20 +187,22 @@ def repeat_prompt(input_ids: list[int], target_length: int):
return (input_ids * repeat_times)[:target_length] return (input_ids * repeat_times)[:target_length]
def test( class TestModel:
model: infinicore.nn.Module
tokenizer: AutoTokenizer
input_ids_list: list[int]
def __init__(
self,
model_path, model_path,
infini_dtype=infinicore.bfloat16, infini_dtype=infinicore.bfloat16,
infini_device=infinicore.device("cpu", 0), infini_device=infinicore.device("cpu", 0),
batch_size=1,
tp=1, tp=1,
input_len=10, ) -> None:
output_len=10,
):
model_path = os.path.expanduser(model_path) model_path = os.path.expanduser(model_path)
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
# 创建模型, # 创建模型,
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
model = infinilm.AutoLlamaModel.from_pretrained( model = infinilm.AutoLlamaModel.from_pretrained(
model_path, model_path,
device=infini_device, device=infini_device,
...@@ -121,13 +233,20 @@ def test( ...@@ -121,13 +233,20 @@ def test(
] ]
# print(input_content, end="", flush=True) # print(input_content, end="", flush=True)
input_ids_list = tokenizer.batch_encode_plus(input_content)[ input_ids_list = tokenizer.batch_encode_plus(input_content)["input_ids"]
"input_ids"
] # List: [[1, 1128, 526, 366, 29892]]
input_ids = repeat_prompt(input_ids_list[0], target_length=input_len) self.model = model
self.tokenizer = tokenizer
self.input_ids_list = input_ids_list
def run(
self,
batch_size: int,
input_len: int,
output_len: int,
):
input_ids = repeat_prompt(self.input_ids_list[0], target_length=input_len)
input_ids_list = [input_ids] * batch_size input_ids_list = [input_ids] * batch_size
# print(input_ids_list)
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
# 自回归生成 # 自回归生成
...@@ -136,11 +255,10 @@ def test( ...@@ -136,11 +255,10 @@ def test(
t1 = time.time() t1 = time.time()
print("=================== start generate ====================") print("=================== start generate ====================")
model.generate( self.model.generate(
input_ids_infini, input_ids_infini,
max_new_tokens=output_len, max_new_tokens=output_len,
device=infini_device, tokenizer=self.tokenizer,
tokenizer=tokenizer,
stop_on_eos=False, stop_on_eos=False,
) )
t2 = time.time() t2 = time.time()
...@@ -162,15 +280,13 @@ if __name__ == "__main__": ...@@ -162,15 +280,13 @@ if __name__ == "__main__":
device_str = "cuda" device_str = "cuda"
else: else:
print( print(
"python examples/bench.py --nvidia --model=~/TinyLlama-1.1B-Chat-v1.0/ --batch-size=2 --tensor-parallel-size=1 --input-len=50 --output-len=50" "python examples/bench.py --nvidia --model=~/TinyLlama-1.1B-Chat-v1.0/ --batch-size=2 --tp=1 --input-len=50 --output-len=50"
) )
sys.exit(1) sys.exit(1)
# -------------------------------------------------------- #
# 解析参数
# -------------------------------------------------------- #
model_path = args.model model_path = args.model
batch_size = args.batch_size
tp = args.tensor_parallel_size
output_len = args.output_len
input_len = args.input_len
infini_device = infinicore.device(device_str, 0) infini_device = infinicore.device(device_str, 0)
if args.dtype == "float32": if args.dtype == "float32":
...@@ -182,12 +298,50 @@ if __name__ == "__main__": ...@@ -182,12 +298,50 @@ if __name__ == "__main__":
else: else:
raise ValueError(f"Unsupported dtype: {args.dtype}") raise ValueError(f"Unsupported dtype: {args.dtype}")
test( tp = args.tensor_parallel_size
batch_size = args.batch_size
input_len = args.input_len
output_len = args.output_len
if isinstance(batch_size, int):
batch_size = [batch_size]
if isinstance(input_len, int):
input_len = [input_len]
if isinstance(output_len, int):
output_len = [output_len]
cases_dict = get_test_cases(model_path, batch_size, input_len, output_len)
# -------------------------------------------------------- #
# 测试
# -------------------------------------------------------- #
# print("=================== start test ====================", type(batch_size))
test = TestModel(
model_path, model_path,
infini_device=infini_device,
infini_dtype=infini_dtype, infini_dtype=infini_dtype,
batch_size=batch_size, infini_device=infini_device,
tp=tp, tp=tp,
)
for idx, case in tqdm(cases_dict.items(), desc="Processing cases"):
tqdm.write(f"\033[92mProcessing : {case}\033[0m")
batch_size = case["batch_size"]
input_len = case["input_len"]
output_len = case["output_len"]
# reset cache for each case
initial_capacity = input_len + output_len + 100
test.model.reset_cache(
batch_size=batch_size, pos=0, initial_capacity=initial_capacity
)
# run test one case
test.run(
batch_size=batch_size,
input_len=input_len, input_len=input_len,
output_len=output_len, output_len=output_len,
) )
...@@ -65,7 +65,7 @@ def get_args(): ...@@ -65,7 +65,7 @@ def get_args():
help="float32, float16, bfloat16", help="float32, float16, bfloat16",
) )
parser.add_argument( parser.add_argument(
"--batch_size", "--batch-size",
type=int, type=int,
default=1, default=1,
help="number of prompts in a batch", help="number of prompts in a batch",
...@@ -164,7 +164,6 @@ def test( ...@@ -164,7 +164,6 @@ def test(
model.generate( model.generate(
input_ids_infini, input_ids_infini,
max_new_tokens=max_new_tokens, max_new_tokens=max_new_tokens,
device=infini_device,
tokenizer=tokenizer, tokenizer=tokenizer,
) )
t2 = time.time() t2 = time.time()
...@@ -192,8 +191,8 @@ if __name__ == "__main__": ...@@ -192,8 +191,8 @@ if __name__ == "__main__":
device_str = "cuda" device_str = "cuda"
else: else:
print( print(
"Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>\n" "Usage: python examples/jiuge.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>\n"
"such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0" "such as, python examples/jiuge.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
) )
sys.exit(1) sys.exit(1)
prompts = [args.prompt for _ in range(args.batch_size)] prompts = [args.prompt for _ in range(args.batch_size)]
......
...@@ -163,7 +163,6 @@ def test( ...@@ -163,7 +163,6 @@ def test(
model.generate( model.generate(
input_ids_infini, input_ids_infini,
max_new_tokens=max_new_tokens, max_new_tokens=max_new_tokens,
device=infini_device,
tokenizer=tokenizer, tokenizer=tokenizer,
) )
t2 = time.time() t2 = time.time()
......
...@@ -169,7 +169,6 @@ class GenerationMixin: ...@@ -169,7 +169,6 @@ class GenerationMixin:
Parameters: Parameters:
input_ids (batch_size, seq_len): The sequence used as a prompt for the generation. input_ids (batch_size, seq_len): The sequence used as a prompt for the generation.
max_new_tokens: Maximum number of new tokens. max_new_tokens: Maximum number of new tokens.
device: infinicore.device.
tokenizer: translating data into raw text. tokenizer: translating data into raw text.
""" """
......
...@@ -189,6 +189,16 @@ class LlamaForCausalLM(GenerationMixin): ...@@ -189,6 +189,16 @@ class LlamaForCausalLM(GenerationMixin):
config._underlying, distributed_config._underlying, device._underlying.type config._underlying, distributed_config._underlying, device._underlying.type
) )
def reset_cache(self, batch_size: int, pos: int = 0, initial_capacity: int = 1024):
    """Reset the model's KV cache for a new generation run.

    Parameters:
        batch_size: number of sequences the rebuilt cache must accommodate.
        pos: position to reset the cache to (0 = completely fresh cache).
        initial_capacity: initial cache capacity to pre-allocate
            (presumably per-sequence token count — confirm against the
            underlying cache config).
    """
    # Ensure all in-flight device work has completed before the cache's
    # backing storage is reconfigured/reallocated.
    infinicore.sync_device()
    cache_config = self._model.get_cache_config()
    cache_config.initial_batch_size = batch_size
    cache_config.initial_capacity = initial_capacity
    self._model.reset_cache(cache_config, pos)
def state_dict_keyname(self): def state_dict_keyname(self):
"""Get model key name.""" """Get model key name."""
return self._model.state_dict()[0].keys() return self._model.state_dict()[0].keys()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment