"vllm/executor/ray_distributed_executor.py" did not exist on "dda4811591fdb90d263bc9b8ac522436369aef13"
Unverified commit a27b288e, authored by rongfu.leng and committed by GitHub
Browse files

[Feature] default --extra-body param to disable thinking in vllm bench serve (#26784)


Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
parent e471d7ca
...@@ -1230,6 +1230,15 @@ def add_cli_args(parser: argparse.ArgumentParser): ...@@ -1230,6 +1230,15 @@ def add_cli_args(parser: argparse.ArgumentParser):
"the ready check will be skipped.", "the ready check will be skipped.",
) )
parser.add_argument(
"--extra-body",
help="A JSON string representing extra body parameters to include "
"in each request."
'Example: \'{"chat_template_kwargs":{"enable_thinking":false}}\'',
type=json.loads,
default=None,
)
def main(args: argparse.Namespace) -> dict[str, Any]: def main(args: argparse.Namespace) -> dict[str, Any]:
return asyncio.run(main_async(args)) return asyncio.run(main_async(args))
...@@ -1330,6 +1339,9 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: ...@@ -1330,6 +1339,9 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
else: else:
sampling_params = {} sampling_params = {}
extra_body = args.extra_body or {}
extra_body = {**sampling_params, **extra_body}
# Avoid GC processing "static" data - reduce pause times. # Avoid GC processing "static" data - reduce pause times.
gc.collect() gc.collect()
gc.freeze() gc.freeze()
...@@ -1355,7 +1367,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: ...@@ -1355,7 +1367,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
max_concurrency=args.max_concurrency, max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules, lora_modules=args.lora_modules,
extra_headers=headers, extra_headers=headers,
extra_body=sampling_params, extra_body=extra_body,
ramp_up_strategy=args.ramp_up_strategy, ramp_up_strategy=args.ramp_up_strategy,
ramp_up_start_rps=args.ramp_up_start_rps, ramp_up_start_rps=args.ramp_up_start_rps,
ramp_up_end_rps=args.ramp_up_end_rps, ramp_up_end_rps=args.ramp_up_end_rps,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment