Commit 00d3d196 authored by zhuwenwen
Browse files

update benchmarks

parent 2f5f98bb
......@@ -375,6 +375,20 @@ def main(args: argparse.Namespace):
args.output_len)
if args.backend == "vllm":
if args.async_engine:
run_args = [
requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.distributed_executor_backend,
args.gpu_memory_utilization, args.num_scheduler_steps,
args.use_v2_block_manager, args.download_dir, args.load_format,
args.disable_async_output_proc
]
else:
run_args = [
warmup_requests, requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
......
......@@ -9,7 +9,7 @@ ray >= 2.10.0
peft
pytest-asyncio
tensorizer>=2.9.0
setuptools_scm
setuptools_scm>=8
torch == 2.3.0
triton == 2.1.0
......
......@@ -375,6 +375,20 @@ def main(args: argparse.Namespace):
args.output_len)
if args.backend == "vllm":
if args.async_engine:
run_args = [
requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.distributed_executor_backend,
args.gpu_memory_utilization, args.num_scheduler_steps,
args.use_v2_block_manager, args.download_dir, args.load_format,
args.disable_async_output_proc
]
else:
run_args = [
warmup_requests, requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
......
......@@ -289,6 +289,12 @@ class ModelConfig:
quantization_override = method.override_quantization_method(
quant_cfg, self.quantization)
if quantization_override:
if is_hip():
if quantization_override in rocm_supported_quantization:
quant_method = quantization_override
self.quantization = quantization_override
break
else:
quant_method = quantization_override
self.quantization = quantization_override
break
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment