Unverified Commit 6ae48322 authored by gongchensu, committed by GitHub

Merge pull request #242 from InfiniTensor/issue/241

issue/241 fix mmlu test, add vllm support
parents e76bb324 0086ff2f
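
With this change, the benchmark script accepts a third backend via --backend vllm. A minimal invocation sketch (the model directory path is a placeholder, not taken from this commit):

    python test_benchmark.py --nvidia /path/to/model_dir --bench mmlu --backend vllm --ndev 1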
@@ -4,11 +4,6 @@ import time
 import re
 import csv
 import numpy as np
-import infinicore
-from infinilm.modeling_utils import load_model_state_dict_by_file
-from infinilm.distributed import DistConfig
-from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig
-from infinilm.infer_engine import GenerationConfig, InferEngine
 from datasets import load_dataset, Dataset
 from abc import ABC, abstractmethod
@@ -57,6 +52,11 @@ class InfiniLMBenchmark(BaseBenchmark):
         enable_paged_attn=False,
     ):
         import transformers
+        import infinicore
+        from infinilm.modeling_utils import load_model_state_dict_by_file
+        from infinilm.distributed import DistConfig
+        from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig
+        from infinilm.infer_engine import InferEngine

         self.benchmark = benchmark
@@ -103,7 +103,9 @@ class InfiniLMBenchmark(BaseBenchmark):
             )
         elif model_type in ["qwen2", "qwen3"]:
             # For qwen2/qwen3 models: no trust_remote_code (matches jiuge line 534-536)
-            self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path)
+            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                model_dir_path, trust_remote_code=True
+            )
         else:
             # Default: use trust_remote_code=True for other models
             self.tokenizer = transformers.AutoTokenizer.from_pretrained(
@@ -179,6 +181,9 @@ class InfiniLMBenchmark(BaseBenchmark):
         which properly handles KV cache through GenerationMixin.
         """
         # Convert tokens to infinicore format
+        import infinicore
+        from infinilm.infer_engine import GenerationConfig
+
         input_ids_list = [tokens]
         input_ids = infinicore.from_list(input_ids_list)
@@ -370,6 +375,124 @@ class TorchBenchmark(BaseBenchmark):
         print("Torch model destroyed")


+class VLLMBenchmark(BaseBenchmark):
+    """vLLM backend using vllm.LLM"""
+
+    def __init__(
+        self,
+        model_dir_path,
+        device_type_str="nvidia",
+        tensor_parallel_size=1,
+        benchmark="ceval",
+    ):
+        import transformers
+        from vllm import LLM
+
+        if device_type_str == "cpu":
+            raise ValueError("vLLM backend does not support CPU device type.")
+
+        self.benchmark = benchmark
+
+        # ---- tokenizer ----
+        with open(os.path.join(model_dir_path, "config.json"), "r") as f:
+            import json
+
+            self.config_dict = json.load(f)
+        model_type = self.config_dict.get("model_type", "")
+        if model_type in ["qwen2", "qwen3"]:
+            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                model_dir_path, trust_remote_code=True
+            )
+        else:
+            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                model_dir_path, trust_remote_code=True
+            )
+
+        eos_token_id = self.config_dict.get("eos_token_id")
+        self.eos_token_id = (
+            [eos_token_id] if isinstance(eos_token_id, int) else eos_token_id
+        )
+
+        # ---- vLLM engine ----
+        print("Loading model with vLLM backend...")
+        self.llm = LLM(
+            model=model_dir_path,
+            tensor_parallel_size=tensor_parallel_size,
+            trust_remote_code=True,
+        )
+        print("vLLM model loaded successfully")
+
+    def max_context_len(self):
+        return self.config_dict.get("max_position_embeddings", 2048)
+
+    def render_input_content(self, *args, **kwargs):
+        if self.benchmark == "ceval":
+            return render_ceval(self.tokenizer, *args, **kwargs)
+        elif self.benchmark == "mmlu":
+            return render_mmlu(self.tokenizer, *args, **kwargs)
+        else:
+            raise ValueError(f"Unknown benchmark: {self.benchmark}")
+
+    def generate(self, *args, max_steps=500, topp_=1.0, topk_=1, temperature_=1.0):
+        input_content = self.render_input_content(*args)
+        print(input_content, end="", flush=True)
+        tokens = self.encode_text(input_content)
+        return self._generate_step(tokens, max_steps, topp_, topk_, temperature_)
+
+    def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_):
+        from vllm import SamplingParams
+
+        prompt = self.tokenizer.decode(tokens)
+        sampling_params = SamplingParams(
+            max_tokens=max_steps,
+            temperature=temperature_,
+            top_p=topp_,
+            top_k=topk_,
+            stop_token_ids=self.eos_token_id,
+        )
+
+        start_time = time.perf_counter()
+        outputs = self.llm.generate(
+            prompts=[prompt],
+            sampling_params=sampling_params,
+        )
+        end_time = time.perf_counter()
+
+        # ---- post process ----
+        output_text = outputs[0].outputs[0].text
+
+        # ---- stats ----
+        input_tokens = len(tokens)
+        new_tokens = len(self.encode_text(output_text))
+        total_tokens = input_tokens + new_tokens
+        total_time = end_time - start_time
+        throughput = total_tokens / total_time if total_time > 0 else 0.0
+
+        print(output_text)
+        print()
+        print(f"Total time: {total_time * 1000:.2f} ms")
+        print(f"Input tokens: {input_tokens}")
+        print(f"New tokens: {new_tokens}")
+        print(f"Total tokens processed: {total_tokens}")
+        print(f"Throughput: {throughput:.2f} tok/s")
+
+        global TOTAL_TOKENS, TOTAL_TIME
+        TOTAL_TOKENS += total_tokens
+        TOTAL_TIME += total_time
+
+        return output_text
+
+    def destroy_model_instance(self):
+        del self.llm
+        print("vLLM model destroyed")
+
+
 def render_ceval(_tokenizer, conversation):
     """Render C-Eval conversation to input content"""
     return (
@@ -397,13 +520,16 @@ def render_mmlu(_tokenizer, question, choices):
     if hasattr(_tokenizer, "apply_chat_template"):
         conversation = [
             {"role": "system", "content": instruction},
-            {"role": "user", "content": f"{question}\n{choices_text}\nAnswer:"},
+            {"role": "user", "content": f"{question}\n{choices_text}\n"},
         ]
         try:
-            return _tokenizer.apply_chat_template(
-                conversation=conversation,
-                add_generation_prompt=True,
-                tokenize=False,
-            )
+            return (
+                _tokenizer.apply_chat_template(
+                    conversation=conversation,
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+                + "The answer is: "
+            )
         except Exception:
             return prompt
@@ -663,7 +789,7 @@ def test():
     # Parse arguments manually to handle device flags properly
     if len(sys.argv) < 4:
         print(
-            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--split {test|val|all}] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
+            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch|vllm] [--ndev N] [--subject SUBJECT] [--split {test|val|all}] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
         )
         sys.exit(1)
@@ -750,7 +876,7 @@ def test():
         device_type_str = "ali"
     else:
         print(
-            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
+            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch|vllm] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
        )
         sys.exit(1)
@@ -773,7 +899,10 @@ def test():
     # Create model based on backend (create once, reuse for all subjects)
     if backend == "torch":
+        assert ndev == 1, "Torch backend only supports single-device evaluation"
         model = TorchBenchmark(model_path, device_type_str, benchmark)
+    elif backend == "vllm":
+        model = VLLMBenchmark(model_path, device_type_str, ndev, benchmark)
     else:
         model = InfiniLMBenchmark(
             model_path, device_type_str, ndev, backend, benchmark, enable_paged_attn
@@ -944,7 +1073,9 @@ def test():
         splits_to_load = (
             ["test"]
             if split == "test"
-            else ["validation"] if split == "val" else ["validation", "test"]
+            else ["validation"]
+            if split == "val"
+            else ["validation", "test"]
         )
         # Load each subject individually from hardcoded list, excluding "all"
         for subject_name in mmlu_subjects:
@@ -966,7 +1097,9 @@ def test():
         splits_to_load = (
             ["test"]
             if split == "test"
-            else ["validation"] if split == "val" else ["validation", "test"]
+            else ["validation"]
+            if split == "val"
+            else ["validation", "test"]
        )
         records = []
         for sp in splits_to_load:
...
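
Below is an illustrative sketch of how the new VLLMBenchmark backend is driven, mirroring the construction in test() above for --backend vllm. It assumes the class is in scope (e.g. inside test_benchmark.py itself); the model directory, question, and choices are placeholders and are not part of this commit.

    # Construct the backend the way test() does for --backend vllm
    model = VLLMBenchmark(
        "/path/to/model_dir",       # placeholder checkpoint directory
        device_type_str="nvidia",
        tensor_parallel_size=1,     # ndev is forwarded here as tensor parallelism
        benchmark="mmlu",
    )

    # For --bench mmlu, generate() forwards (question, choices) to render_mmlu()
    question = "Which planet is known as the Red Planet?"  # placeholder question
    choices = ["Venus", "Mars", "Jupiter", "Saturn"]       # placeholder choices
    answer_text = model.generate(question, choices, max_steps=32)

    model.destroy_model_instance()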