Unverified commit 6768ff4a, authored by Harry Mellor, committed by GitHub
Browse files

Move the last arguments in `arg_utils.py` to be in their final groups (#17531)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent f2e7af9b
...@@ -1637,7 +1637,7 @@ class ParallelConfig: ...@@ -1637,7 +1637,7 @@ class ParallelConfig:
"""Use expert parallelism instead of tensor parallelism for MoE layers.""" """Use expert parallelism instead of tensor parallelism for MoE layers."""
max_parallel_loading_workers: Optional[int] = None max_parallel_loading_workers: Optional[int] = None
"""Maximum number of parallal loading workers when loading model """Maximum number of parallel loading workers when loading model
sequentially in multiple batches. To avoid RAM OOM when using tensor sequentially in multiple batches. To avoid RAM OOM when using tensor
parallel and large models.""" parallel and large models."""
......
...@@ -474,15 +474,21 @@ class EngineArgs: ...@@ -474,15 +474,21 @@ class EngineArgs:
title="LoadConfig", title="LoadConfig",
description=LoadConfig.__doc__, description=LoadConfig.__doc__,
) )
load_group.add_argument('--load-format', load_group.add_argument("--load-format",
choices=[f.value for f in LoadFormat], choices=[f.value for f in LoadFormat],
**load_kwargs["load_format"]) **load_kwargs["load_format"])
load_group.add_argument('--download-dir', load_group.add_argument("--download-dir",
**load_kwargs["download_dir"]) **load_kwargs["download_dir"])
load_group.add_argument('--model-loader-extra-config', load_group.add_argument("--model-loader-extra-config",
**load_kwargs["model_loader_extra_config"]) **load_kwargs["model_loader_extra_config"])
load_group.add_argument('--use-tqdm-on-load', load_group.add_argument("--ignore-patterns",
**load_kwargs["ignore_patterns"])
load_group.add_argument("--use-tqdm-on-load",
**load_kwargs["use_tqdm_on_load"]) **load_kwargs["use_tqdm_on_load"])
load_group.add_argument('--qlora-adapter-name-or-path',
type=str,
default=None,
help='Name or path of the QLoRA adapter.')
# Guided decoding arguments # Guided decoding arguments
guided_decoding_kwargs = get_kwargs(DecodingConfig) guided_decoding_kwargs = get_kwargs(DecodingConfig)
...@@ -501,6 +507,14 @@ class EngineArgs: ...@@ -501,6 +507,14 @@ class EngineArgs:
guided_decoding_group.add_argument( guided_decoding_group.add_argument(
"--guided-decoding-disable-additional-properties", "--guided-decoding-disable-additional-properties",
**guided_decoding_kwargs["disable_additional_properties"]) **guided_decoding_kwargs["disable_additional_properties"])
guided_decoding_group.add_argument(
"--enable-reasoning",
action=argparse.BooleanOptionalAction,
help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as "
"of v0.8.6. Use `--reasoning-parser` to specify the reasoning "
"parser backend insteadThis flag (`--enable-reasoning`) will be "
"removed in v0.10.0. When `--reasoning-parser` is specified, "
"reasoning mode is automatically enabled.")
guided_decoding_group.add_argument( guided_decoding_group.add_argument(
"--reasoning-parser", "--reasoning-parser",
# This choices is a special case because it's not static # This choices is a special case because it's not static
...@@ -514,27 +528,31 @@ class EngineArgs: ...@@ -514,27 +528,31 @@ class EngineArgs:
description=ParallelConfig.__doc__, description=ParallelConfig.__doc__,
) )
parallel_group.add_argument( parallel_group.add_argument(
'--distributed-executor-backend', "--distributed-executor-backend",
**parallel_kwargs["distributed_executor_backend"]) **parallel_kwargs["distributed_executor_backend"])
parallel_group.add_argument( parallel_group.add_argument(
'--pipeline-parallel-size', '-pp', "--pipeline-parallel-size", "-pp",
**parallel_kwargs["pipeline_parallel_size"]) **parallel_kwargs["pipeline_parallel_size"])
parallel_group.add_argument('--tensor-parallel-size', '-tp', parallel_group.add_argument("--tensor-parallel-size", "-tp",
**parallel_kwargs["tensor_parallel_size"]) **parallel_kwargs["tensor_parallel_size"])
parallel_group.add_argument('--data-parallel-size', '-dp', parallel_group.add_argument("--data-parallel-size", "-dp",
**parallel_kwargs["data_parallel_size"]) **parallel_kwargs["data_parallel_size"])
parallel_group.add_argument( parallel_group.add_argument(
'--enable-expert-parallel', "--enable-expert-parallel",
**parallel_kwargs["enable_expert_parallel"]) **parallel_kwargs["enable_expert_parallel"])
parallel_group.add_argument( parallel_group.add_argument(
'--max-parallel-loading-workers', "--max-parallel-loading-workers",
**parallel_kwargs["max_parallel_loading_workers"]) **parallel_kwargs["max_parallel_loading_workers"])
parallel_group.add_argument( parallel_group.add_argument(
'--ray-workers-use-nsight', "--ray-workers-use-nsight",
**parallel_kwargs["ray_workers_use_nsight"]) **parallel_kwargs["ray_workers_use_nsight"])
parallel_group.add_argument( parallel_group.add_argument(
'--disable-custom-all-reduce', "--disable-custom-all-reduce",
**parallel_kwargs["disable_custom_all_reduce"]) **parallel_kwargs["disable_custom_all_reduce"])
parallel_group.add_argument("--worker-cls",
**parallel_kwargs["worker_cls"])
parallel_group.add_argument("--worker-extension-cls",
**parallel_kwargs["worker_extension_cls"])
# KV cache arguments # KV cache arguments
cache_kwargs = get_kwargs(CacheConfig) cache_kwargs = get_kwargs(CacheConfig)
...@@ -542,47 +560,34 @@ class EngineArgs: ...@@ -542,47 +560,34 @@ class EngineArgs:
title="CacheConfig", title="CacheConfig",
description=CacheConfig.__doc__, description=CacheConfig.__doc__,
) )
cache_group.add_argument('--block-size', **cache_kwargs["block_size"]) cache_group.add_argument("--block-size", **cache_kwargs["block_size"])
cache_group.add_argument('--gpu-memory-utilization', cache_group.add_argument("--gpu-memory-utilization",
**cache_kwargs["gpu_memory_utilization"]) **cache_kwargs["gpu_memory_utilization"])
cache_group.add_argument('--swap-space', **cache_kwargs["swap_space"]) cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
cache_group.add_argument('--kv-cache-dtype', cache_group.add_argument("--kv-cache-dtype",
**cache_kwargs["cache_dtype"]) **cache_kwargs["cache_dtype"])
cache_group.add_argument('--num-gpu-blocks-override', cache_group.add_argument("--num-gpu-blocks-override",
**cache_kwargs["num_gpu_blocks_override"]) **cache_kwargs["num_gpu_blocks_override"])
cache_group.add_argument("--enable-prefix-caching", cache_group.add_argument("--enable-prefix-caching",
**cache_kwargs["enable_prefix_caching"]) **cache_kwargs["enable_prefix_caching"])
cache_group.add_argument("--prefix-caching-hash-algo", cache_group.add_argument("--prefix-caching-hash-algo",
**cache_kwargs["prefix_caching_hash_algo"]) **cache_kwargs["prefix_caching_hash_algo"])
cache_group.add_argument('--cpu-offload-gb', cache_group.add_argument("--cpu-offload-gb",
**cache_kwargs["cpu_offload_gb"]) **cache_kwargs["cpu_offload_gb"])
cache_group.add_argument('--calculate-kv-scales', cache_group.add_argument("--calculate-kv-scales",
**cache_kwargs["calculate_kv_scales"]) **cache_kwargs["calculate_kv_scales"])
parser.add_argument('--use-v2-block-manager',
action='store_true',
default=True,
help='[DEPRECATED] block manager v1 has been '
'removed and SelfAttnBlockSpaceManager (i.e. '
'block manager v2) is now the default. '
'Setting this flag to True or False'
' has no effect on vLLM behavior.')
parser.add_argument('--disable-log-stats',
action='store_true',
help='Disable logging statistics.')
# Tokenizer arguments # Tokenizer arguments
tokenizer_kwargs = get_kwargs(TokenizerPoolConfig) tokenizer_kwargs = get_kwargs(TokenizerPoolConfig)
tokenizer_group = parser.add_argument_group( tokenizer_group = parser.add_argument_group(
title="TokenizerPoolConfig", title="TokenizerPoolConfig",
description=TokenizerPoolConfig.__doc__, description=TokenizerPoolConfig.__doc__,
) )
tokenizer_group.add_argument('--tokenizer-pool-size', tokenizer_group.add_argument("--tokenizer-pool-size",
**tokenizer_kwargs["pool_size"]) **tokenizer_kwargs["pool_size"])
tokenizer_group.add_argument('--tokenizer-pool-type', tokenizer_group.add_argument("--tokenizer-pool-type",
**tokenizer_kwargs["pool_type"]) **tokenizer_kwargs["pool_type"])
tokenizer_group.add_argument('--tokenizer-pool-extra-config', tokenizer_group.add_argument("--tokenizer-pool-extra-config",
**tokenizer_kwargs["extra_config"]) **tokenizer_kwargs["extra_config"])
# Multimodal related configs # Multimodal related configs
...@@ -591,13 +596,13 @@ class EngineArgs: ...@@ -591,13 +596,13 @@ class EngineArgs:
title="MultiModalConfig", title="MultiModalConfig",
description=MultiModalConfig.__doc__, description=MultiModalConfig.__doc__,
) )
multimodal_group.add_argument('--limit-mm-per-prompt', multimodal_group.add_argument("--limit-mm-per-prompt",
**multimodal_kwargs["limit_per_prompt"]) **multimodal_kwargs["limit_per_prompt"])
multimodal_group.add_argument( multimodal_group.add_argument(
'--mm-processor-kwargs', "--mm-processor-kwargs",
**multimodal_kwargs["mm_processor_kwargs"]) **multimodal_kwargs["mm_processor_kwargs"])
multimodal_group.add_argument( multimodal_group.add_argument(
'--disable-mm-preprocessor-cache', "--disable-mm-preprocessor-cache",
**multimodal_kwargs["disable_mm_preprocessor_cache"]) **multimodal_kwargs["disable_mm_preprocessor_cache"])
# LoRA related configs # LoRA related configs
...@@ -607,25 +612,25 @@ class EngineArgs: ...@@ -607,25 +612,25 @@ class EngineArgs:
description=LoRAConfig.__doc__, description=LoRAConfig.__doc__,
) )
lora_group.add_argument( lora_group.add_argument(
'--enable-lora', "--enable-lora",
action=argparse.BooleanOptionalAction, action=argparse.BooleanOptionalAction,
help='If True, enable handling of LoRA adapters.') help="If True, enable handling of LoRA adapters.")
lora_group.add_argument('--enable-lora-bias', lora_group.add_argument("--enable-lora-bias",
**lora_kwargs["bias_enabled"]) **lora_kwargs["bias_enabled"])
lora_group.add_argument('--max-loras', **lora_kwargs["max_loras"]) lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"])
lora_group.add_argument('--max-lora-rank', lora_group.add_argument("--max-lora-rank",
**lora_kwargs["max_lora_rank"]) **lora_kwargs["max_lora_rank"])
lora_group.add_argument('--lora-extra-vocab-size', lora_group.add_argument("--lora-extra-vocab-size",
**lora_kwargs["lora_extra_vocab_size"]) **lora_kwargs["lora_extra_vocab_size"])
lora_group.add_argument( lora_group.add_argument(
'--lora-dtype', "--lora-dtype",
**lora_kwargs["lora_dtype"], **lora_kwargs["lora_dtype"],
) )
lora_group.add_argument('--long-lora-scaling-factors', lora_group.add_argument("--long-lora-scaling-factors",
**lora_kwargs["long_lora_scaling_factors"]) **lora_kwargs["long_lora_scaling_factors"])
lora_group.add_argument('--max-cpu-loras', lora_group.add_argument("--max-cpu-loras",
**lora_kwargs["max_cpu_loras"]) **lora_kwargs["max_cpu_loras"])
lora_group.add_argument('--fully-sharded-loras', lora_group.add_argument("--fully-sharded-loras",
**lora_kwargs["fully_sharded_loras"]) **lora_kwargs["fully_sharded_loras"])
# PromptAdapter related configs # PromptAdapter related configs
...@@ -635,14 +640,14 @@ class EngineArgs: ...@@ -635,14 +640,14 @@ class EngineArgs:
description=PromptAdapterConfig.__doc__, description=PromptAdapterConfig.__doc__,
) )
prompt_adapter_group.add_argument( prompt_adapter_group.add_argument(
'--enable-prompt-adapter', "--enable-prompt-adapter",
action=argparse.BooleanOptionalAction, action=argparse.BooleanOptionalAction,
help='If True, enable handling of PromptAdapters.') help="If True, enable handling of PromptAdapters.")
prompt_adapter_group.add_argument( prompt_adapter_group.add_argument(
'--max-prompt-adapters', "--max-prompt-adapters",
**prompt_adapter_kwargs["max_prompt_adapters"]) **prompt_adapter_kwargs["max_prompt_adapters"])
prompt_adapter_group.add_argument( prompt_adapter_group.add_argument(
'--max-prompt-adapter-token', "--max-prompt-adapter-token",
**prompt_adapter_kwargs["max_prompt_adapter_token"]) **prompt_adapter_kwargs["max_prompt_adapter_token"])
# Device arguments # Device arguments
...@@ -659,25 +664,11 @@ class EngineArgs: ...@@ -659,25 +664,11 @@ class EngineArgs:
description=SpeculativeConfig.__doc__, description=SpeculativeConfig.__doc__,
) )
speculative_group.add_argument( speculative_group.add_argument(
'--speculative-config', "--speculative-config",
type=json.loads, type=json.loads,
default=None, default=None,
help='The configurations for speculative decoding.' help="The configurations for speculative decoding. Should be a "
' Should be a JSON string.') "JSON string.")
parser.add_argument(
'--ignore-patterns',
action="append",
type=str,
default=[],
help="The pattern(s) to ignore when loading the model."
"Default to `original/**/*` to avoid repeated loading of llama's "
"checkpoints.")
parser.add_argument('--qlora-adapter-name-or-path',
type=str,
default=None,
help='Name or path of the QLoRA adapter.')
# Observability arguments # Observability arguments
observability_kwargs = get_kwargs(ObservabilityConfig) observability_kwargs = get_kwargs(ObservabilityConfig)
...@@ -710,9 +701,9 @@ class EngineArgs: ...@@ -710,9 +701,9 @@ class EngineArgs:
description=SchedulerConfig.__doc__, description=SchedulerConfig.__doc__,
) )
scheduler_group.add_argument( scheduler_group.add_argument(
'--max-num-batched-tokens', "--max-num-batched-tokens",
**scheduler_kwargs["max_num_batched_tokens"]) **scheduler_kwargs["max_num_batched_tokens"])
scheduler_group.add_argument('--max-num-seqs', scheduler_group.add_argument("--max-num-seqs",
**scheduler_kwargs["max_num_seqs"]) **scheduler_kwargs["max_num_seqs"])
scheduler_group.add_argument( scheduler_group.add_argument(
"--max-num-partial-prefills", "--max-num-partial-prefills",
...@@ -723,70 +714,78 @@ class EngineArgs: ...@@ -723,70 +714,78 @@ class EngineArgs:
scheduler_group.add_argument( scheduler_group.add_argument(
"--long-prefill-token-threshold", "--long-prefill-token-threshold",
**scheduler_kwargs["long_prefill_token_threshold"]) **scheduler_kwargs["long_prefill_token_threshold"])
scheduler_group.add_argument('--num-lookahead-slots', scheduler_group.add_argument("--num-lookahead-slots",
**scheduler_kwargs["num_lookahead_slots"]) **scheduler_kwargs["num_lookahead_slots"])
scheduler_group.add_argument('--scheduler-delay-factor', scheduler_group.add_argument("--scheduler-delay-factor",
**scheduler_kwargs["delay_factor"]) **scheduler_kwargs["delay_factor"])
scheduler_group.add_argument('--preemption-mode', scheduler_group.add_argument("--preemption-mode",
**scheduler_kwargs["preemption_mode"]) **scheduler_kwargs["preemption_mode"])
scheduler_group.add_argument('--num-scheduler-steps', scheduler_group.add_argument("--num-scheduler-steps",
**scheduler_kwargs["num_scheduler_steps"]) **scheduler_kwargs["num_scheduler_steps"])
scheduler_group.add_argument( scheduler_group.add_argument(
'--multi-step-stream-outputs', "--multi-step-stream-outputs",
**scheduler_kwargs["multi_step_stream_outputs"]) **scheduler_kwargs["multi_step_stream_outputs"])
scheduler_group.add_argument('--scheduling-policy', scheduler_group.add_argument("--scheduling-policy",
**scheduler_kwargs["policy"]) **scheduler_kwargs["policy"])
scheduler_group.add_argument( scheduler_group.add_argument(
'--enable-chunked-prefill', "--enable-chunked-prefill",
**scheduler_kwargs["enable_chunked_prefill"]) **scheduler_kwargs["enable_chunked_prefill"])
scheduler_group.add_argument( scheduler_group.add_argument(
"--disable-chunked-mm-input", "--disable-chunked-mm-input",
**scheduler_kwargs["disable_chunked_mm_input"]) **scheduler_kwargs["disable_chunked_mm_input"])
parser.add_argument('--scheduler-cls', scheduler_group.add_argument("--scheduler-cls",
**scheduler_kwargs["scheduler_cls"]) **scheduler_kwargs["scheduler_cls"])
parser.add_argument('--compilation-config', # Compilation arguments
'-O', # compilation_kwargs = get_kwargs(CompilationConfig)
type=CompilationConfig.from_cli, compilation_group = parser.add_argument_group(
default=None, title="CompilationConfig",
help='torch.compile configuration for the model. ' description=CompilationConfig.__doc__,
'When it is a number (0, 1, 2, 3), it will be ' )
'interpreted as the optimization level.\n' compilation_group.add_argument(
'NOTE: level 0 is the default level without ' "--compilation-config",
'any optimization. level 1 and 2 are for internal ' "-O",
'testing only. level 3 is the recommended level ' type=CompilationConfig.from_cli,
'for production.\n' default=None,
'To specify the full compilation config, ' help="torch.compile configuration for the model. "
'use a JSON string, e.g. ``{"level": 3, ' "When it is a number (0, 1, 2, 3), it will be "
'"cudagraph_capture_sizes": [1, 2, 4, 8]}``\n' "interpreted as the optimization level.\n"
'Following the convention of traditional ' "NOTE: level 0 is the default level without "
'compilers, using ``-O`` without space is also ' "any optimization. level 1 and 2 are for internal "
'supported. ``-O3`` is equivalent to ``-O 3``.') "testing only. level 3 is the recommended level "
"for production.\n"
parser.add_argument('--kv-transfer-config', "To specify the full compilation config, "
type=KVTransferConfig.from_cli, "use a JSON string, e.g. ``{\"level\": 3, "
default=None, "\"cudagraph_capture_sizes\": [1, 2, 4, 8]}``\n"
help='The configurations for distributed KV cache ' "Following the convention of traditional "
'transfer. Should be a JSON string.') "compilers, using ``-O`` without space is also "
parser.add_argument('--kv-events-config', "supported. ``-O3`` is equivalent to ``-O 3``.")
type=KVEventsConfig.from_cli,
default=None, # KVTransfer arguments
help='The configurations for event publishing.') # kv_transfer_kwargs = get_kwargs(KVTransferConfig)
kv_transfer_group = parser.add_argument_group(
parser.add_argument( title="KVTransferConfig",
'--worker-cls', description=KVTransferConfig.__doc__,
type=str, )
default="auto", kv_transfer_group.add_argument(
help='The worker class to use for distributed execution.') "--kv-transfer-config",
parser.add_argument( type=KVTransferConfig.from_cli,
'--worker-extension-cls', default=None,
type=str, help="The configurations for distributed KV cache "
default="", "transfer. Should be a JSON string.")
help='The worker extension class on top of the worker cls, ' kv_transfer_group.add_argument(
'it is useful if you just want to add new functions to the worker ' '--kv-events-config',
'class without changing the existing functions.') type=KVEventsConfig.from_cli,
default=None,
parser.add_argument( help='The configurations for event publishing.')
# vLLM arguments
# vllm_kwargs = get_kwargs(VllmConfig)
vllm_group = parser.add_argument_group(
title="VllmConfig",
description=VllmConfig.__doc__,
)
vllm_group.add_argument(
"--additional-config", "--additional-config",
type=json.loads, type=json.loads,
default=None, default=None,
...@@ -795,20 +794,18 @@ class EngineArgs: ...@@ -795,20 +794,18 @@ class EngineArgs:
"configs are valid for the platform you are using. The input format" "configs are valid for the platform you are using. The input format"
" is like '{\"config_key\":\"config_value\"}'") " is like '{\"config_key\":\"config_value\"}'")
parser.add_argument( # Other arguments
"--enable-reasoning", parser.add_argument('--use-v2-block-manager',
action="store_true", action='store_true',
default=False, default=True,
help= help='[DEPRECATED] block manager v1 has been '
"[DEPRECATED] " \ 'removed and SelfAttnBlockSpaceManager (i.e. '
"The --enable-reasoning flag is deprecated as of v0.8.6. " 'block manager v2) is now the default. '
"Use --reasoning-parser to specify " \ 'Setting this flag to True or False'
"the reasoning parser backend instead. " ' has no effect on vLLM behavior.')
"This flag (--enable-reasoning) will be " \ parser.add_argument('--disable-log-stats',
"removed in v0.10.0. " action='store_true',
"When --reasoning-parser is specified, " \ help='Disable logging statistics.')
"reasoning mode is automatically enabled."
)
return parser return parser
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment