import argparse
import dataclasses
from dataclasses import dataclass
from typing import Optional, Tuple

from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                         SchedulerConfig)


@dataclass
class EngineArgs:
    """Arguments for vLLM engine."""
    model: str
    tokenizer: Optional[str] = None
    tokenizer_mode: str = 'auto'
    trust_remote_code: bool = False
    download_dir: Optional[str] = None
    load_format: str = 'auto'
    dtype: str = 'auto'
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
    block_size: int = 16
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    max_paddings: int = 256
    disable_log_stats: bool = False
    revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
    enforce_eager: bool = False
    max_context_len_to_capture: int = 8192

    def __post_init__(self):
        if self.tokenizer is None:
            self.tokenizer = self.model

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for vLLM engine."""

        # NOTE: If you update any of the arguments below, please also
        # make sure to update docs/source/models/engine_args.rst

        # Model arguments
        parser.add_argument(
            '--model',
            type=str,
            default='facebook/opt-125m',
            help='name or path of the huggingface model to use')
        parser.add_argument(
            '--tokenizer',
            type=str,
            default=EngineArgs.tokenizer,
            help='name or path of the huggingface tokenizer to use')
        parser.add_argument(
            '--revision',
            type=str,
            default=None,
            help='the specific model version to use. It can be a branch '
            'name, a tag name, or a commit id. If unspecified, will use '
            'the default version.')
        parser.add_argument(
            '--tokenizer-revision',
            type=str,
            default=None,
            help='the specific tokenizer version to use. It can be a branch '
            'name, a tag name, or a commit id. If unspecified, will use '
            'the default version.')
        parser.add_argument('--tokenizer-mode',
                            type=str,
                            default=EngineArgs.tokenizer_mode,
                            choices=['auto', 'slow'],
                            help='tokenizer mode. "auto" will use the fast '
                            'tokenizer if available, and "slow" will '
                            'always use the slow tokenizer.')
        parser.add_argument('--trust-remote-code',
                            action='store_true',
                            help='trust remote code from huggingface')
        parser.add_argument('--download-dir',
                            type=str,
                            default=EngineArgs.download_dir,
                            help='directory to download and load the weights, '
                            'defaults to the default cache dir of '
                            'huggingface')
        parser.add_argument(
            '--load-format',
            type=str,
            default=EngineArgs.load_format,
            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
            help='The format of the model weights to load. '
            '"auto" will try to load the weights in the safetensors format '
            'and fall back to the pytorch bin format if safetensors format '
            'is not available. '
            '"pt" will load the weights in the pytorch bin format. '
            '"safetensors" will load the weights in the safetensors format. '
            '"npcache" will load the weights in pytorch format and store '
            'a numpy cache to speed up the loading. '
            '"dummy" will initialize the weights with random values, '
            'which is mainly for profiling.')
        parser.add_argument(
            '--dtype',
            type=str,
            default=EngineArgs.dtype,
            choices=[
                'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
            ],
            help='data type for model weights and activations. '
            'The "auto" option will use FP16 precision '
            'for FP32 and FP16 models, and BF16 precision '
            'for BF16 models.')
        parser.add_argument('--max-model-len',
                            type=int,
                            default=None,
                            help='model context length. If unspecified, '
                            'will be automatically derived from the model.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray',
                            action='store_true',
                            help='use Ray for distributed serving, will be '
                            'automatically set when using more than 1 GPU')
        parser.add_argument('--pipeline-parallel-size',
                            '-pp',
                            type=int,
                            default=EngineArgs.pipeline_parallel_size,
                            help='number of pipeline stages')
        parser.add_argument('--tensor-parallel-size',
                            '-tp',
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
        parser.add_argument(
            '--max-parallel-loading-workers',
            type=int,
            help='load model sequentially in multiple batches, '
            'to avoid RAM OOM when using tensor '
            'parallelism with large models')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
                            choices=[8, 16, 32],
                            help='token block size')
        # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
        parser.add_argument('--seed',
                            type=int,
                            default=EngineArgs.seed,
                            help='random seed')
        parser.add_argument('--swap-space',
                            type=int,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU')
        parser.add_argument(
            '--gpu-memory-utilization',
            type=float,
            default=EngineArgs.gpu_memory_utilization,
            help='the fraction of GPU memory to be used for '
            'the model executor, which can range from 0 to 1. '
            'If unspecified, will use the default value of 0.9.')
        parser.add_argument('--max-num-batched-tokens',
                            type=int,
                            default=EngineArgs.max_num_batched_tokens,
                            help='maximum number of batched tokens per '
                            'iteration')
        parser.add_argument('--max-num-seqs',
                            type=int,
                            default=EngineArgs.max_num_seqs,
                            help='maximum number of sequences per iteration')
        parser.add_argument('--max-paddings',
                            type=int,
                            default=EngineArgs.max_paddings,
                            help='maximum number of paddings in a batch')
        parser.add_argument('--disable-log-stats',
                            action='store_true',
                            help='disable logging statistics')
        # Quantization settings.
        parser.add_argument('--quantization',
                            '-q',
                            type=str,
                            choices=['awq', 'gptq', 'squeezellm', None],
                            default=None,
                            help='Method used to quantize the weights. If '
                            'None, we first check the `quantization_config` '
                            'attribute in the model config file. If that is '
                            'None, we assume the model weights are not '
                            'quantized and use `dtype` to determine the data '
                            'type of the weights.')
        parser.add_argument('--enforce-eager',
                            action='store_true',
                            help='Always use eager-mode PyTorch. If False, '
                            'will use a hybrid of eager mode and CUDA '
                            'graphs for maximal performance and flexibility.')
        parser.add_argument('--max-context-len-to-capture',
                            type=int,
                            default=EngineArgs.max_context_len_to_capture,
                            help='maximum context length covered by CUDA '
                            'graphs. When a sequence has context length '
                            'larger than this, we fall back to eager mode.')
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args

    def create_engine_configs(
        self,
    ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
        model_config = ModelConfig(self.model, self.tokenizer,
                                   self.tokenizer_mode, self.trust_remote_code,
                                   self.download_dir, self.load_format,
                                   self.dtype, self.seed, self.revision,
                                   self.tokenizer_revision, self.max_model_len,
                                   self.quantization, self.enforce_eager,
                                   self.max_context_len_to_capture)
        cache_config = CacheConfig(self.block_size,
                                   self.gpu_memory_utilization,
                                   self.swap_space,
                                   model_config.get_sliding_window())
        parallel_config = ParallelConfig(self.pipeline_parallel_size,
                                         self.tensor_parallel_size,
                                         self.worker_use_ray,
                                         self.max_parallel_loading_workers)
        scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                           self.max_num_seqs,
                                           model_config.max_model_len,
                                           self.max_paddings)
        return model_config, cache_config, parallel_config, scheduler_config
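
# A minimal programmatic usage sketch (illustrative, not part of the original
# file; 'facebook/opt-125m' is just an example model name):
#
#     engine_args = EngineArgs(model='facebook/opt-125m')
#     model_config, cache_config, parallel_config, scheduler_config = \
#         engine_args.create_engine_configs()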


@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous vLLM engine."""
    engine_use_ray: bool = False
    disable_log_requests: bool = False
    max_log_len: Optional[int] = None

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        parser = EngineArgs.add_cli_args(parser)
        parser.add_argument('--engine-use-ray',
                            action='store_true',
                            help='use Ray to start the LLM engine in a '
                            'separate process as the server process.')
        parser.add_argument('--disable-log-requests',
                            action='store_true',
                            help='disable logging requests')
        parser.add_argument('--max-log-len',
                            type=int,
                            default=None,
                            help='max number of prompt characters or prompt '
                            'token IDs printed in the log. '
                            'Default: unlimited.')
        return parser
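

if __name__ == '__main__':
    # Hedged usage sketch (not part of the original module): wire the shared
    # CLI arguments into a parser, recover a typed AsyncEngineArgs, and build
    # the four engine configs. Note that constructing a ModelConfig may fetch
    # the model configuration from the HuggingFace Hub, so run with a locally
    # cached model if offline.
    parser = argparse.ArgumentParser(description='vLLM engine args demo')
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()

    engine_args = AsyncEngineArgs.from_cli_args(args)
    (model_config, cache_config, parallel_config,
     scheduler_config) = engine_args.create_engine_configs()
    print(f'model: {model_config.model}')
    print(f'block size: {cache_config.block_size}')
    print(f'tensor parallel size: {parallel_config.tensor_parallel_size}')
    print(f'max num seqs: {scheduler_config.max_num_seqs}')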