"tools/vscode:/vscode.git/clone" did not exist on "ec17136e6ed7d6be9a60462cc0e3d2d3cba37320"
Unverified commit 9710f718, authored by Liangsheng Yin, committed by GitHub

[Eval] Add `--repeat` in `run_eval` (#11101)

parent 91847e38
@@ -10,11 +10,29 @@ import time

 from sglang.test.simple_eval_common import (
     ChatCompletionSampler,
+    Eval,
     make_report,
     set_ulimit,
 )


+def run_eval_once(args, base_url: str, eval_obj: Eval) -> tuple:
+    sampler = ChatCompletionSampler(
+        model=args.model,
+        max_tokens=getattr(args, "max_tokens", 2048),
+        base_url=base_url,
+        temperature=getattr(args, "temperature", 0.0),
+        reasoning_effort=getattr(args, "reasoning_effort", None),
+    )
+
+    # Run eval
+    tic = time.perf_counter()
+    result = eval_obj(sampler)
+    latency = time.perf_counter() - tic
+
+    return result, latency, sampler
+
+
 def run_eval(args):
     set_ulimit()
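For context, run_eval_once treats eval_obj as a plain callable: it is invoked with a sampler and must return a result object exposing .score and .metrics (both are read later in run_eval). A minimal sketch of that contract, using hypothetical stand-in classes rather than sglang's real Eval implementations:

    from dataclasses import dataclass, field

    @dataclass
    class FakeResult:
        # The two attributes run_eval reads from a result.
        score: float
        metrics: dict = field(default_factory=dict)

    class FakeEval:
        def __call__(self, sampler) -> FakeResult:
            # A real Eval would send prompts through the sampler and grade
            # the completions; this stub just returns a fixed score.
            return FakeResult(score=0.85, metrics={"num_examples": 1})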
@@ -68,18 +86,32 @@ def run_eval(args):
     else:
         raise ValueError(f"Invalid eval name: {args.eval_name}")

-    sampler = ChatCompletionSampler(
-        model=args.model,
-        max_tokens=getattr(args, "max_tokens", 2048),
-        base_url=base_url,
-        temperature=getattr(args, "temperature", 0.0),
-        reasoning_effort=getattr(args, "reasoning_effort", None),
-    )
-
-    # Run eval
-    tic = time.perf_counter()
-    result = eval_obj(sampler)
-    latency = time.perf_counter() - tic
+    if getattr(args, "repeat", 1) == 1:
+        result, latency, sampler = run_eval_once(args, base_url, eval_obj)
+    else:
+        from concurrent.futures import ThreadPoolExecutor
+
+        executor = ThreadPoolExecutor(max_workers=args.repeat)
+
+        futures = [
+            executor.submit(run_eval_once, args, base_url, eval_obj)
+            for _ in range(args.repeat)
+        ]
+        scores_repeat = []
+        for f in futures:
+            result, latency, sampler = f.result()
+            scores_repeat.append(result.score)
+
+        mean_score = sum(scores_repeat) / len(scores_repeat)
+        scores_repeat = [f"{s:.3f}" for s in scores_repeat]
+        print("=" * 20)
+        print(f"Repeat: {args.repeat}, mean: {mean_score:.3f}")
+        print(f"Scores: {scores_repeat}")
+        print("=" * 20)
+
+        executor.shutdown()

     # Dump reports
     metrics = result.metrics | {"score": result.score}
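The else branch above fans the same evaluation out across a thread pool, collects one score per run, and prints the per-run scores alongside their mean; result, latency, and sampler keep the values from the last future collected, so the report below covers a single run. A self-contained sketch of that fan-out pattern, with a hypothetical run_once standing in for run_eval_once:

    from concurrent.futures import ThreadPoolExecutor

    def run_once(i: int) -> float:
        # Stand-in for run_eval_once, reduced to returning a score.
        return 0.80 + 0.01 * (i % 3)

    repeat = 4
    # A with-block joins the workers on exit, equivalent to the explicit
    # executor.shutdown() in the hunk above.
    with ThreadPoolExecutor(max_workers=repeat) as executor:
        futures = [executor.submit(run_once, i) for i in range(repeat)]
        scores = [f.result() for f in futures]

    mean_score = sum(scores) / len(scores)
    print(f"Repeat: {repeat}, mean: {mean_score:.3f}")
    print("Scores:", [f"{s:.3f}" for s in scores])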
@@ -125,6 +157,9 @@ if __name__ == "__main__":
         type=str,
         help="Name or path of the model. If not set, the default model will request /v1/models for conf.",
     )
+    parser.add_argument(
+        "--repeat", type=int, default=1, help="repeat the evaluation n times"
+    )
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)