Commit 00d3d196 authored by zhuwenwen
Browse files

update benchmarks

parent 2f5f98bb
...@@ -375,18 +375,32 @@ def main(args: argparse.Namespace): ...@@ -375,18 +375,32 @@ def main(args: argparse.Namespace):
args.output_len) args.output_len)
if args.backend == "vllm": if args.backend == "vllm":
run_args = [ if args.async_engine:
warmup_requests, requests, args.model, args.tokenizer, args.quantization, run_args = [
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, requests, args.model, args.tokenizer, args.quantization,
args.trust_remote_code, args.dtype, args.max_model_len, args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.enforce_eager, args.kv_cache_dtype, args.trust_remote_code, args.dtype, args.max_model_len,
args.quantization_param_path, args.device, args.enforce_eager, args.kv_cache_dtype,
args.enable_prefix_caching, args.enable_chunked_prefill, args.quantization_param_path, args.device,
args.max_num_batched_tokens, args.distributed_executor_backend, args.enable_prefix_caching, args.enable_chunked_prefill,
args.gpu_memory_utilization, args.num_scheduler_steps, args.max_num_batched_tokens, args.distributed_executor_backend,
args.use_v2_block_manager, args.download_dir, args.load_format, args.gpu_memory_utilization, args.num_scheduler_steps,
args.disable_async_output_proc args.use_v2_block_manager, args.download_dir, args.load_format,
] args.disable_async_output_proc
]
else:
run_args = [
warmup_requests, requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.distributed_executor_backend,
args.gpu_memory_utilization, args.num_scheduler_steps,
args.use_v2_block_manager, args.download_dir, args.load_format,
args.disable_async_output_proc
]
if args.async_engine: if args.async_engine:
run_args.append(args.disable_frontend_multiprocessing) run_args.append(args.disable_frontend_multiprocessing)
......
...@@ -9,7 +9,7 @@ ray >= 2.10.0 ...@@ -9,7 +9,7 @@ ray >= 2.10.0
peft peft
pytest-asyncio pytest-asyncio
tensorizer>=2.9.0 tensorizer>=2.9.0
setuptools_scm setuptools_scm>=8
torch == 2.3.0 torch == 2.3.0
triton == 2.1.0 triton == 2.1.0
......
...@@ -375,18 +375,32 @@ def main(args: argparse.Namespace): ...@@ -375,18 +375,32 @@ def main(args: argparse.Namespace):
args.output_len) args.output_len)
if args.backend == "vllm": if args.backend == "vllm":
run_args = [ if args.async_engine:
warmup_requests, requests, args.model, args.tokenizer, args.quantization, run_args = [
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, requests, args.model, args.tokenizer, args.quantization,
args.trust_remote_code, args.dtype, args.max_model_len, args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.enforce_eager, args.kv_cache_dtype, args.trust_remote_code, args.dtype, args.max_model_len,
args.quantization_param_path, args.device, args.enforce_eager, args.kv_cache_dtype,
args.enable_prefix_caching, args.enable_chunked_prefill, args.quantization_param_path, args.device,
args.max_num_batched_tokens, args.distributed_executor_backend, args.enable_prefix_caching, args.enable_chunked_prefill,
args.gpu_memory_utilization, args.num_scheduler_steps, args.max_num_batched_tokens, args.distributed_executor_backend,
args.use_v2_block_manager, args.download_dir, args.load_format, args.gpu_memory_utilization, args.num_scheduler_steps,
args.disable_async_output_proc args.use_v2_block_manager, args.download_dir, args.load_format,
] args.disable_async_output_proc
]
else:
run_args = [
warmup_requests, requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.distributed_executor_backend,
args.gpu_memory_utilization, args.num_scheduler_steps,
args.use_v2_block_manager, args.download_dir, args.load_format,
args.disable_async_output_proc
]
if args.async_engine: if args.async_engine:
run_args.append(args.disable_frontend_multiprocessing) run_args.append(args.disable_frontend_multiprocessing)
......
...@@ -289,9 +289,15 @@ class ModelConfig: ...@@ -289,9 +289,15 @@ class ModelConfig:
quantization_override = method.override_quantization_method( quantization_override = method.override_quantization_method(
quant_cfg, self.quantization) quant_cfg, self.quantization)
if quantization_override: if quantization_override:
quant_method = quantization_override if is_hip():
self.quantization = quantization_override if quantization_override in rocm_supported_quantization:
break quant_method = quantization_override
self.quantization = quantization_override
break
else:
quant_method = quantization_override
self.quantization = quantization_override
break
# Verify quantization configurations. # Verify quantization configurations.
if self.quantization is None: if self.quantization is None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment