import argparse
import dataclasses
from dataclasses import dataclass
from typing import Optional

from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
                         EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
                         TokenizerPoolConfig, VisionLanguageConfig)
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import str_to_int_tuple


def nullable_str(val: str) -> Optional[str]:
    """Argparse ``type`` converter mapping "empty" values to ``None``.

    Args:
        val: Raw string received from the command line.

    Returns:
        ``None`` when ``val`` is falsy (e.g. the empty string) or is the
        literal string ``"None"``; otherwise ``val`` unchanged.
    """
    if not val or val == "None":
        return None
    return val


@dataclass
class EngineArgs:
    """Arguments for vLLM engine.

    Every dataclass field has a matching command-line flag registered by
    ``add_cli_args``. ``from_cli_args`` reads each field back by name from
    the parsed namespace, so an attribute must carry a type annotation
    (i.e. be a real dataclass field) to be picked up from the CLI.
    """
    model: str
    tokenizer: Optional[str] = None
    skip_tokenizer_init: bool = False
    tokenizer_mode: str = 'auto'
    trust_remote_code: bool = False
    download_dir: Optional[str] = None
    load_format: str = 'auto'
    dtype: str = 'auto'
    kv_cache_dtype: str = 'auto'
    quantization_param_path: Optional[str] = None
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
    block_size: int = 16
    enable_prefix_caching: bool = False
    use_v2_block_manager: bool = False
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    max_logprobs: int = 5  # OpenAI default value
    disable_log_stats: bool = False
    revision: Optional[str] = None
    code_revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
    enforce_eager: bool = False
    max_context_len_to_capture: Optional[int] = None
    max_seq_len_to_capture: int = 8192
    disable_custom_all_reduce: bool = False
    tokenizer_pool_size: int = 0
    tokenizer_pool_type: str = "ray"
    tokenizer_pool_extra_config: Optional[dict] = None
    enable_lora: bool = False
    max_loras: int = 1
    max_lora_rank: int = 16
    fully_sharded_loras: bool = False
    lora_extra_vocab_size: int = 256
    # Must be annotated: an un-annotated assignment is a plain class
    # attribute, which dataclasses.fields() skips, so from_cli_args would
    # silently ignore --lora-dtype and the constructor would reject it.
    lora_dtype: str = 'auto'
    max_cpu_loras: Optional[int] = None
    device: str = 'auto'
    ray_workers_use_nsight: bool = False
    num_gpu_blocks_override: Optional[int] = None
    num_lookahead_slots: int = 0
    model_loader_extra_config: Optional[dict] = None

    # Related to Vision-language models such as llava
    image_input_type: Optional[str] = None
    image_token_id: Optional[int] = None
    image_input_shape: Optional[str] = None
    image_feature_size: Optional[int] = None
    scheduler_delay_factor: float = 0.0
    enable_chunked_prefill: bool = False

    guided_decoding_backend: str = 'outlines'
    # Speculative decoding configuration.
    speculative_model: Optional[str] = None
    num_speculative_tokens: Optional[int] = None
    speculative_max_model_len: Optional[int] = None
    ngram_prompt_lookup_max: Optional[int] = None
    ngram_prompt_lookup_min: Optional[int] = None

    def __post_init__(self):
        """Fall back to the model name/path when no tokenizer is given."""
        if self.tokenizer is None:
            self.tokenizer = self.model

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for vLLM engine.

        Args:
            parser: Parser to which the engine flags are added in place.

        Returns:
            The same ``parser`` instance, for call chaining.
        """

        # Model arguments
        parser.add_argument(
            '--model',
            type=str,
            default='facebook/opt-125m',
            help='Name or path of the huggingface model to use.')
        parser.add_argument(
            '--tokenizer',
            type=nullable_str,
            default=EngineArgs.tokenizer,
            help='Name or path of the huggingface tokenizer to use.')
        parser.add_argument(
            '--skip-tokenizer-init',
            action='store_true',
            help='Skip initialization of tokenizer and detokenizer')
        parser.add_argument(
            '--revision',
            type=nullable_str,
            default=None,
            help='The specific model version to use. It can be a branch '
            'name, a tag name, or a commit id. If unspecified, will use '
            'the default version.')
        parser.add_argument(
            '--code-revision',
            type=nullable_str,
            default=None,
            help='The specific revision to use for the model code on '
            'Hugging Face Hub. It can be a branch name, a tag name, or a '
            'commit id. If unspecified, will use the default version.')
        parser.add_argument(
            '--tokenizer-revision',
            type=nullable_str,
            default=None,
            help='The specific tokenizer version to use. It can be a branch '
            'name, a tag name, or a commit id. If unspecified, will use '
            'the default version.')
        parser.add_argument(
            '--tokenizer-mode',
            type=str,
            default=EngineArgs.tokenizer_mode,
            choices=['auto', 'slow'],
            help='The tokenizer mode.\n\n* "auto" will use the '
            'fast tokenizer if available.\n* "slow" will '
            'always use the slow tokenizer.')
        parser.add_argument('--trust-remote-code',
                            action='store_true',
                            help='Trust remote code from huggingface.')
        parser.add_argument('--download-dir',
                            type=nullable_str,
                            default=EngineArgs.download_dir,
                            help='Directory to download and load the weights, '
                            'default to the default cache dir of '
                            'huggingface.')
        parser.add_argument(
            '--load-format',
            type=str,
            default=EngineArgs.load_format,
            choices=[
                'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer'
            ],
            help='The format of the model weights to load.\n\n'
            '* "auto" will try to load the weights in the safetensors format '
            'and fall back to the pytorch bin format if safetensors format '
            'is not available.\n'
            '* "pt" will load the weights in the pytorch bin format.\n'
            '* "safetensors" will load the weights in the safetensors format.\n'
            '* "npcache" will load the weights in pytorch format and store '
            'a numpy cache to speed up the loading.\n'
            '* "dummy" will initialize the weights with random values, '
            'which is mainly for profiling.\n'
            '* "tensorizer" will load the weights using tensorizer from '
            'CoreWeave which assumes tensorizer_uri is set to the location of '
            'the serialized weights.')
        parser.add_argument(
            '--dtype',
            type=str,
            default=EngineArgs.dtype,
            choices=[
                'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
            ],
            help='Data type for model weights and activations.\n\n'
            '* "auto" will use FP16 precision for FP32 and FP16 models, and '
            'BF16 precision for BF16 models.\n'
            '* "half" for FP16. Recommended for AWQ quantization.\n'
            '* "float16" is the same as "half".\n'
            '* "bfloat16" for a balance between precision and range.\n'
            '* "float" is shorthand for FP32 precision.\n'
            '* "float32" for FP32 precision.')
        parser.add_argument(
            '--kv-cache-dtype',
            type=str,
            choices=['auto', 'fp8'],
            default=EngineArgs.kv_cache_dtype,
            help='Data type for kv cache storage. If "auto", will use model '
            'data type. FP8_E5M2 (without scaling) is only supported on cuda '
            'version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
            'supported for common inference criteria.')
        parser.add_argument(
            '--quantization-param-path',
            type=nullable_str,
            default=None,
            help='Path to the JSON file containing the KV cache '
            'scaling factors. This should generally be supplied, when '
            'KV cache dtype is FP8. Otherwise, KV cache scaling factors '
            'default to 1.0, which may cause accuracy issues. '
            'FP8_E5M2 (without scaling) is only supported on cuda version '
            'greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
            'supported for common inference criteria.')
        parser.add_argument('--max-model-len',
                            type=int,
                            default=EngineArgs.max_model_len,
                            help='Model context length. If unspecified, will '
                            'be automatically derived from the model config.')
        parser.add_argument(
            '--guided-decoding-backend',
            type=str,
            default=EngineArgs.guided_decoding_backend,
            choices=['outlines', 'lm-format-enforcer'],
            help='Which engine will be used for guided decoding'
            ' (JSON schema / regex etc) by default. Currently support '
            'https://github.com/outlines-dev/outlines and '
            'https://github.com/noamgat/lm-format-enforcer.'
            ' Can be overridden per request via guided_decoding_backend'
            ' parameter.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray',
                            action='store_true',
                            help='Use Ray for distributed serving, will be '
                            'automatically set when using more than 1 GPU.')
        parser.add_argument('--pipeline-parallel-size',
                            '-pp',
                            type=int,
                            default=EngineArgs.pipeline_parallel_size,
                            help='Number of pipeline stages.')
        parser.add_argument('--tensor-parallel-size',
                            '-tp',
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='Number of tensor parallel replicas.')
        parser.add_argument(
            '--max-parallel-loading-workers',
            type=int,
            default=EngineArgs.max_parallel_loading_workers,
            help='Load model sequentially in multiple batches, '
            'to avoid RAM OOM when using tensor '
            'parallel and large models.')
        parser.add_argument(
            '--ray-workers-use-nsight',
            action='store_true',
            help='If specified, use nsight to profile Ray workers.')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
                            choices=[8, 16, 32],
                            help='Token block size for contiguous chunks of '
                            'tokens.')

        parser.add_argument('--enable-prefix-caching',
                            action='store_true',
                            help='Enables automatic prefix caching.')
        parser.add_argument('--use-v2-block-manager',
                            action='store_true',
                            help='Use BlockSpaceManagerV2.')
        parser.add_argument(
            '--num-lookahead-slots',
            type=int,
            default=EngineArgs.num_lookahead_slots,
            help='Experimental scheduling config necessary for '
            'speculative decoding. This will be replaced by '
            'speculative config in the future; it is present '
            'to enable correctness tests until then.')

        parser.add_argument('--seed',
                            type=int,
                            default=EngineArgs.seed,
                            help='Random seed for operations.')
        parser.add_argument('--swap-space',
                            type=int,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU.')
        parser.add_argument(
            '--gpu-memory-utilization',
            type=float,
            default=EngineArgs.gpu_memory_utilization,
            # '%%' is required: argparse %-formats help strings.
            help='The fraction of GPU memory to be used for the model '
            'executor, which can range from 0 to 1. For example, a value of '
            '0.5 would imply 50%% GPU memory utilization. If unspecified, '
            'will use the default value of 0.9.')
        parser.add_argument(
            '--num-gpu-blocks-override',
            type=int,
            default=None,
            help='If specified, ignore GPU profiling result and use this '
            'number of GPU blocks. Used for testing preemption.')
        parser.add_argument('--max-num-batched-tokens',
                            type=int,
                            default=EngineArgs.max_num_batched_tokens,
                            help='Maximum number of batched tokens per '
                            'iteration.')
        parser.add_argument('--max-num-seqs',
                            type=int,
                            default=EngineArgs.max_num_seqs,
                            help='Maximum number of sequences per iteration.')
        parser.add_argument(
            '--max-logprobs',
            type=int,
            default=EngineArgs.max_logprobs,
            help=('Max number of log probs to return when logprobs is '
                  'specified in SamplingParams.'))
        parser.add_argument('--disable-log-stats',
                            action='store_true',
                            help='Disable logging statistics.')
        # Quantization settings.
        parser.add_argument('--quantization',
                            '-q',
                            type=nullable_str,
                            choices=[*QUANTIZATION_METHODS, None],
                            default=EngineArgs.quantization,
                            help='Method used to quantize the weights. If '
                            'None, we first check the `quantization_config` '
                            'attribute in the model config file. If that is '
                            'None, we assume the model weights are not '
                            'quantized and use `dtype` to determine the data '
                            'type of the weights.')
        parser.add_argument('--enforce-eager',
                            action='store_true',
                            help='Always use eager-mode PyTorch. If False, '
                            'will use eager mode and CUDA graph in hybrid '
                            'for maximal performance and flexibility.')
        parser.add_argument('--max-context-len-to-capture',
                            type=int,
                            default=EngineArgs.max_context_len_to_capture,
                            help='Maximum context length covered by CUDA '
                            'graphs. When a sequence has context length '
                            'larger than this, we fall back to eager mode. '
                            '(DEPRECATED. Use --max-seq_len-to-capture instead'
                            ')')
        # The historical spelling mixes '_' and '-'. It stays first so that
        # argparse still derives dest='max_seq_len_to_capture' from it; the
        # all-hyphen spelling is accepted as an alias.
        parser.add_argument('--max-seq_len-to-capture',
                            '--max-seq-len-to-capture',
                            type=int,
                            default=EngineArgs.max_seq_len_to_capture,
                            help='Maximum sequence length covered by CUDA '
                            'graphs. When a sequence has context length '
                            'larger than this, we fall back to eager mode.')
        parser.add_argument('--disable-custom-all-reduce',
                            action='store_true',
                            default=EngineArgs.disable_custom_all_reduce,
                            help='See ParallelConfig.')
        parser.add_argument('--tokenizer-pool-size',
                            type=int,
                            default=EngineArgs.tokenizer_pool_size,
                            help='Size of tokenizer pool to use for '
                            'asynchronous tokenization. If 0, will '
                            'use synchronous tokenization.')
        parser.add_argument('--tokenizer-pool-type',
                            type=str,
                            default=EngineArgs.tokenizer_pool_type,
                            help='Type of tokenizer pool to use for '
                            'asynchronous tokenization. Ignored '
                            'if tokenizer_pool_size is 0.')
        parser.add_argument('--tokenizer-pool-extra-config',
                            type=nullable_str,
                            default=EngineArgs.tokenizer_pool_extra_config,
                            help='Extra config for tokenizer pool. '
                            'This should be a JSON string that will be '
                            'parsed into a dictionary. Ignored if '
                            'tokenizer_pool_size is 0.')
        # LoRA related configs
        parser.add_argument('--enable-lora',
                            action='store_true',
                            help='If True, enable handling of LoRA adapters.')
        parser.add_argument('--max-loras',
                            type=int,
                            default=EngineArgs.max_loras,
                            help='Max number of LoRAs in a single batch.')
        parser.add_argument('--max-lora-rank',
                            type=int,
                            default=EngineArgs.max_lora_rank,
                            help='Max LoRA rank.')
        parser.add_argument(
            '--lora-extra-vocab-size',
            type=int,
            default=EngineArgs.lora_extra_vocab_size,
            help=('Maximum size of extra vocabulary that can be '
                  'present in a LoRA adapter (added to the base '
                  'model vocabulary).'))
        parser.add_argument(
            '--lora-dtype',
            type=str,
            default=EngineArgs.lora_dtype,
            choices=['auto', 'float16', 'bfloat16', 'float32'],
            help=('Data type for LoRA. If auto, will default to '
                  'base model dtype.'))
        parser.add_argument(
            '--max-cpu-loras',
            type=int,
            default=EngineArgs.max_cpu_loras,
            help=('Maximum number of LoRAs to store in CPU memory. '
                  'Must be >= than max_num_seqs. '
                  'Defaults to max_num_seqs.'))
        parser.add_argument(
            '--fully-sharded-loras',
            action='store_true',
            help=('By default, only half of the LoRA computation is '
                  'sharded with tensor parallelism. '
                  'Enabling this will use the fully sharded layers. '
                  'At high sequence length, max rank or '
                  'tensor parallel size, this is likely faster.'))
        parser.add_argument("--device",
                            type=str,
                            default=EngineArgs.device,
                            choices=["auto", "cuda", "neuron", "cpu"],
                            help='Device type for vLLM execution.')
        # Related to Vision-language models such as llava
        parser.add_argument(
            '--image-input-type',
            type=nullable_str,
            default=None,
            choices=[
                t.name.lower() for t in VisionLanguageConfig.ImageInputType
            ],
            help=('The image input type passed into vLLM. '
                  'Should be one of "pixel_values" or "image_features".'))
        parser.add_argument('--image-token-id',
                            type=int,
                            default=None,
                            help=('Input id for image token.'))
        parser.add_argument(
            '--image-input-shape',
            type=nullable_str,
            default=None,
            help=('The biggest image input shape (worst for memory footprint) '
                  'given an input type. Only used for vLLM\'s profile_run.'))
        parser.add_argument(
            '--image-feature-size',
            type=int,
            default=None,
            help=('The image feature size along the context dimension.'))
        parser.add_argument(
            '--scheduler-delay-factor',
            type=float,
            default=EngineArgs.scheduler_delay_factor,
            help='Apply a delay (of delay factor multiplied by previous '
            'prompt latency) before scheduling next prompt.')
        parser.add_argument(
            '--enable-chunked-prefill',
            action='store_true',
            help='If set, the prefill requests can be chunked based on the '
            'max_num_batched_tokens.')

        parser.add_argument(
            '--speculative-model',
            type=nullable_str,
            default=EngineArgs.speculative_model,
            help=
            'The name of the draft model to be used in speculative decoding.')

        parser.add_argument(
            '--num-speculative-tokens',
            type=int,
            default=EngineArgs.num_speculative_tokens,
            help='The number of speculative tokens to sample from '
            'the draft model in speculative decoding.')

        parser.add_argument(
            '--speculative-max-model-len',
            type=int,
            default=EngineArgs.speculative_max_model_len,
            help='The maximum sequence length supported by the '
            'draft model. Sequences over this length will skip '
            'speculation.')

        parser.add_argument(
            '--ngram-prompt-lookup-max',
            type=int,
            default=EngineArgs.ngram_prompt_lookup_max,
            help='Max size of window for ngram prompt lookup in speculative '
            'decoding.')

        parser.add_argument(
            '--ngram-prompt-lookup-min',
            type=int,
            default=EngineArgs.ngram_prompt_lookup_min,
            help='Min size of window for ngram prompt lookup in speculative '
            'decoding.')

        parser.add_argument('--model-loader-extra-config',
                            type=nullable_str,
                            default=EngineArgs.model_loader_extra_config,
                            help='Extra config for model loader. '
                            'This will be passed to the model loader '
                            'corresponding to the chosen load_format. '
                            'This should be a JSON string that will be '
                            'parsed into a dictionary.')

        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
        """Build an instance from a parsed argparse namespace.

        Args:
            args: Namespace that must expose one attribute per dataclass
                field of ``cls`` (including fields of subclasses).

        Returns:
            A new ``cls`` instance populated from ``args``.

        Raises:
            AttributeError: If ``args`` lacks an attribute for some field.
        """
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args

    def create_engine_config(self) -> EngineConfig:
        """Assemble the per-subsystem config objects into an EngineConfig.

        Returns:
            An ``EngineConfig`` bundling the model, cache, parallel,
            scheduler, device, LoRA, vision-language, speculative, load and
            decoding configs built from this instance's fields.

        Raises:
            ValueError: If ``image_input_type`` is set but any of
                ``image_token_id``, ``image_input_shape`` or
                ``image_feature_size`` is missing.
        """
        device_config = DeviceConfig(self.device)
        model_config = ModelConfig(
            self.model, self.tokenizer, self.tokenizer_mode,
            self.trust_remote_code, self.dtype, self.seed, self.revision,
            self.code_revision, self.tokenizer_revision, self.max_model_len,
            self.quantization, self.quantization_param_path,
            self.enforce_eager, self.max_context_len_to_capture,
            self.max_seq_len_to_capture, self.max_logprobs,
            self.skip_tokenizer_init)
        cache_config = CacheConfig(self.block_size,
                                   self.gpu_memory_utilization,
                                   self.swap_space, self.kv_cache_dtype,
                                   self.num_gpu_blocks_override,
                                   model_config.get_sliding_window(),
                                   self.enable_prefix_caching)
        parallel_config = ParallelConfig(
            self.pipeline_parallel_size, self.tensor_parallel_size,
            self.worker_use_ray, self.max_parallel_loading_workers,
            self.disable_custom_all_reduce,
            TokenizerPoolConfig.create_config(
                self.tokenizer_pool_size,
                self.tokenizer_pool_type,
                self.tokenizer_pool_extra_config,
            ), self.ray_workers_use_nsight)

        speculative_config = SpeculativeConfig.maybe_create_spec_config(
            target_model_config=model_config,
            target_parallel_config=parallel_config,
            target_dtype=self.dtype,
            speculative_model=self.speculative_model,
            num_speculative_tokens=self.num_speculative_tokens,
            speculative_max_model_len=self.speculative_max_model_len,
            enable_chunked_prefill=self.enable_chunked_prefill,
            use_v2_block_manager=self.use_v2_block_manager,
            ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
            ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
        )

        scheduler_config = SchedulerConfig(
            self.max_num_batched_tokens,
            self.max_num_seqs,
            model_config.max_model_len,
            self.use_v2_block_manager,
            # A speculative config, when present, overrides the
            # user-supplied lookahead slot count.
            num_lookahead_slots=(self.num_lookahead_slots
                                 if speculative_config is None else
                                 speculative_config.num_lookahead_slots),
            delay_factor=self.scheduler_delay_factor,
            enable_chunked_prefill=self.enable_chunked_prefill,
        )
        # A missing or non-positive max_cpu_loras is passed on as None.
        lora_config = LoRAConfig(
            max_lora_rank=self.max_lora_rank,
            max_loras=self.max_loras,
            fully_sharded_loras=self.fully_sharded_loras,
            lora_extra_vocab_size=self.lora_extra_vocab_size,
            lora_dtype=self.lora_dtype,
            max_cpu_loras=self.max_cpu_loras
            if self.max_cpu_loras and self.max_cpu_loras > 0 else None
        ) if self.enable_lora else None

        load_config = LoadConfig(
            load_format=self.load_format,
            download_dir=self.download_dir,
            model_loader_extra_config=self.model_loader_extra_config,
        )

        if self.image_input_type:
            # image_token_id is compared against None explicitly: a token id
            # of 0 is falsy and would otherwise be treated as "missing".
            if (self.image_token_id is None or not self.image_input_shape
                    or not self.image_feature_size):
                raise ValueError(
                    'Specify `image_token_id`, `image_input_shape` and '
                    '`image_feature_size` together with `image_input_type`.')
            vision_language_config = VisionLanguageConfig(
                image_input_type=VisionLanguageConfig.
                get_image_input_enum_type(self.image_input_type),
                image_token_id=self.image_token_id,
                image_input_shape=str_to_int_tuple(self.image_input_shape),
                image_feature_size=self.image_feature_size,
            )
        else:
            vision_language_config = None

        decoding_config = DecodingConfig(
            guided_decoding_backend=self.guided_decoding_backend)

        return EngineConfig(model_config=model_config,
                            cache_config=cache_config,
                            parallel_config=parallel_config,
                            scheduler_config=scheduler_config,
                            device_config=device_config,
                            lora_config=lora_config,
                            vision_language_config=vision_language_config,
                            speculative_config=speculative_config,
                            load_config=load_config,
                            decoding_config=decoding_config)


@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous vLLM engine."""
    engine_use_ray: bool = False
    disable_log_requests: bool = False
    max_log_len: Optional[int] = None

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser,
                     async_args_only: bool = False) -> argparse.ArgumentParser:
        """Register the async-engine flags on ``parser``.

        Args:
            parser: Parser to extend in place.
            async_args_only: When True, skip the shared ``EngineArgs`` flags
                and add only the async-specific ones.

        Returns:
            The parser with the flags added.
        """
        if not async_args_only:
            parser = EngineArgs.add_cli_args(parser)
        parser.add_argument('--engine-use-ray',
                            action='store_true',
                            help='Use Ray to start the LLM engine in a '
                            'separate process as the server process.')
        parser.add_argument('--disable-log-requests',
                            action='store_true',
                            help='Disable logging requests.')
        parser.add_argument('--max-log-len',
                            type=int,
                            default=None,
                            help='Max number of prompt characters or prompt '
                            'ID numbers being printed in log.'
                            '\n\nDefault: Unlimited')
        return parser


# These functions are used by sphinx to build the documentation
def _engine_args_parser():
    """Return a fresh parser carrying the shared engine flags."""
    return EngineArgs.add_cli_args(argparse.ArgumentParser())


def _async_engine_args_parser():
    """Return a fresh parser carrying only the async-specific flags."""
    return AsyncEngineArgs.add_cli_args(argparse.ArgumentParser(),
                                        async_args_only=True)