server_args.py 13.5 KB
Newer Older
Lianmin Zheng's avatar
Lianmin Zheng committed
1
2
"""The arguments of the server."""

Lianmin Zheng's avatar
Lianmin Zheng committed
3
4
import argparse
import dataclasses
5
import random
6
from typing import List, Optional, Union
Lianmin Zheng's avatar
Lianmin Zheng committed
7
8
9
10


@dataclasses.dataclass
class ServerArgs:
    """The arguments of the server.

    Fields are grouped by purpose. ``add_cli_args`` exposes every field as a
    command-line flag (defaults are read from the dataclass field defaults so
    the two never diverge), and ``from_cli_args`` maps a parsed namespace back
    into a ``ServerArgs`` instance.
    """

    # Model and tokenizer
    model_path: str
    tokenizer_path: Optional[str] = None
    tokenizer_mode: str = "auto"
    load_format: str = "auto"
    dtype: str = "auto"
    # NOTE(review): the CLI flag below is `store_true`, so values built via
    # `from_cli_args` default to False and this True default is unreachable
    # from the command line — confirm which default is intended upstream.
    trust_remote_code: bool = True
    context_length: Optional[int] = None
    quantization: Optional[str] = None
    chat_template: Optional[str] = None

    # Port
    host: str = "127.0.0.1"
    port: int = 30000
    # Accepts a single int or a list; normalized to a list in __post_init__.
    additional_ports: Optional[Union[List[int], int]] = None

    # Memory and scheduling
    mem_fraction_static: Optional[float] = None
    max_prefill_tokens: Optional[int] = None
    max_running_requests: Optional[int] = None
    max_num_reqs: Optional[int] = None
    schedule_heuristic: str = "lpm"
    schedule_conservativeness: float = 1.0

    # Other runtime options
    tp_size: int = 1
    stream_interval: int = 1
    random_seed: Optional[int] = None

    # Logging
    log_level: str = "info"
    log_level_http: Optional[str] = None
    log_requests: bool = False
    show_time_cost: bool = False

    # Other
    api_key: str = ""

    # Data parallelism
    dp_size: int = 1
    load_balance_method: str = "round_robin"

    # Optimization/debug options
    disable_flashinfer: bool = False
    disable_radix_cache: bool = False
    disable_regex_jump_forward: bool = False
    disable_cuda_graph: bool = False
    disable_disk_cache: bool = False
    enable_torch_compile: bool = False
    attention_reduce_in_fp32: bool = False
    enable_p2p_check: bool = False
    efficient_weight_load: bool = False

    # Distributed args
    nccl_init_addr: Optional[str] = None
    nnodes: int = 1
    node_rank: Optional[int] = None

    def __post_init__(self):
        """Fill in derived defaults that depend on other fields."""
        if self.tokenizer_path is None:
            self.tokenizer_path = self.model_path

        # Reserve a smaller static fraction as tp_size grows.
        if self.mem_fraction_static is None:
            if self.tp_size >= 16:
                self.mem_fraction_static = 0.80
            elif self.tp_size >= 8:
                self.mem_fraction_static = 0.84
            elif self.tp_size >= 4:
                self.mem_fraction_static = 0.86
            elif self.tp_size >= 2:
                self.mem_fraction_static = 0.88
            else:
                self.mem_fraction_static = 0.89

        # Normalize additional_ports to a list.
        if isinstance(self.additional_ports, int):
            self.additional_ports = [self.additional_ports]
        elif self.additional_ports is None:
            self.additional_ports = []

        if self.random_seed is None:
            self.random_seed = random.randint(0, 1 << 30)

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        """Register one CLI flag per dataclass field on *parser*."""
        parser.add_argument(
            "--model-path",
            type=str,
            help="The path of the model weights. This can be a local folder or a Hugging Face repo ID.",
            required=True,
        )
        parser.add_argument(
            "--tokenizer-path",
            type=str,
            default=ServerArgs.tokenizer_path,
            help="The path of the tokenizer.",
        )
        parser.add_argument(
            "--host", type=str, default=ServerArgs.host, help="The host of the server."
        )
        parser.add_argument(
            "--port", type=int, default=ServerArgs.port, help="The port of the server."
        )
        parser.add_argument(
            "--additional-ports",
            type=int,
            nargs="*",
            default=[],
            help="The additional ports specified for the server.",
        )
        parser.add_argument(
            "--tokenizer-mode",
            type=str,
            default=ServerArgs.tokenizer_mode,
            choices=["auto", "slow"],
            help="Tokenizer mode. 'auto' will use the fast "
            "tokenizer if available, and 'slow' will "
            "always use the slow tokenizer.",
        )
        parser.add_argument(
            "--load-format",
            type=str,
            default=ServerArgs.load_format,
            choices=["auto", "pt", "safetensors", "npcache", "dummy"],
            help="The format of the model weights to load. "
            '"auto" will try to load the weights in the safetensors format '
            "and fall back to the pytorch bin format if safetensors format "
            "is not available. "
            '"pt" will load the weights in the pytorch bin format. '
            '"safetensors" will load the weights in the safetensors format. '
            '"npcache" will load the weights in pytorch format and store '
            "a numpy cache to speed up the loading. "
            '"dummy" will initialize the weights with random values, '
            "which is mainly for profiling.",
        )
        parser.add_argument(
            "--dtype",
            type=str,
            default=ServerArgs.dtype,
            choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
            help="Data type for model weights and activations.\n\n"
            '* "auto" will use FP16 precision for FP32 and FP16 models, and '
            "BF16 precision for BF16 models.\n"
            '* "half" for FP16. Recommended for AWQ quantization.\n'
            '* "float16" is the same as "half".\n'
            '* "bfloat16" for a balance between precision and range.\n'
            '* "float" is shorthand for FP32 precision.\n'
            '* "float32" for FP32 precision.',
        )
        parser.add_argument(
            "--trust-remote-code",
            action="store_true",
            help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
        )
        parser.add_argument(
            "--context-length",
            type=int,
            default=ServerArgs.context_length,
            help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
        )
        parser.add_argument(
            "--quantization",
            type=str,
            default=ServerArgs.quantization,
            choices=[
                "awq",
                "fp8",
                "gptq",
                "marlin",
                "gptq_marlin",
                "squeezellm",
                "bitsandbytes",
            ],
            help="The quantization method.",
        )
        parser.add_argument(
            "--chat-template",
            type=str,
            default=ServerArgs.chat_template,
            # Fixed typo: "buliltin" -> "builtin".
            help="The builtin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
        )
        parser.add_argument(
            "--mem-fraction-static",
            type=float,
            default=ServerArgs.mem_fraction_static,
            help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
        )
        parser.add_argument(
            "--max-prefill-tokens",
            type=int,
            default=ServerArgs.max_prefill_tokens,
            help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
        )
        parser.add_argument(
            "--max-running-requests",
            type=int,
            default=ServerArgs.max_running_requests,
            help="The maximum number of running requests.",
        )
        parser.add_argument(
            "--max-num-reqs",
            type=int,
            # Reference the dataclass default instead of a hardcoded None so
            # the CLI stays in sync with the field definition.
            default=ServerArgs.max_num_reqs,
            help="The maximum number of requests to serve in the memory pool. If the model has a large context length, you may need to decrease this value to avoid out-of-memory errors.",
        )
        parser.add_argument(
            "--schedule-heuristic",
            type=str,
            default=ServerArgs.schedule_heuristic,
            choices=["lpm", "random", "fcfs", "dfs-weight"],
            help="The scheduling heuristic.",
        )
        parser.add_argument(
            "--schedule-conservativeness",
            type=float,
            default=ServerArgs.schedule_conservativeness,
            help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
        )
        parser.add_argument(
            "--tp-size",
            type=int,
            default=ServerArgs.tp_size,
            help="The tensor parallelism size.",
        )
        parser.add_argument(
            "--stream-interval",
            type=int,
            default=ServerArgs.stream_interval,
            help="The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher",
        )
        parser.add_argument(
            "--random-seed",
            type=int,
            default=ServerArgs.random_seed,
            help="The random seed.",
        )
        parser.add_argument(
            "--log-level",
            type=str,
            default=ServerArgs.log_level,
            help="The logging level of all loggers.",
        )
        parser.add_argument(
            "--log-level-http",
            type=str,
            default=ServerArgs.log_level_http,
            help="The logging level of HTTP server. If not set, reuse --log-level by default.",
        )
        parser.add_argument(
            "--log-requests",
            action="store_true",
            help="Log the inputs and outputs of all requests.",
        )
        parser.add_argument(
            "--show-time-cost",
            action="store_true",
            help="Show time cost of custom marks.",
        )
        parser.add_argument(
            "--api-key",
            type=str,
            default=ServerArgs.api_key,
            help="Set API key of the server.",
        )

        # Data parallelism
        parser.add_argument(
            "--dp-size",
            type=int,
            default=ServerArgs.dp_size,
            help="The data parallelism size.",
        )
        parser.add_argument(
            "--load-balance-method",
            type=str,
            default=ServerArgs.load_balance_method,
            help="The load balancing strategy for data parallelism.",
            choices=[
                "round_robin",
                "shortest_queue",
            ],
        )

        # Multi-node distributed serving args
        parser.add_argument(
            "--nccl-init-addr",
            type=str,
            help="The nccl init address of multi-node server.",
        )
        parser.add_argument(
            # Reference the dataclass default instead of a hardcoded 1.
            "--nnodes", type=int, default=ServerArgs.nnodes, help="The number of nodes."
        )
        parser.add_argument("--node-rank", type=int, help="The node rank.")

        # Optimization/debug options
        parser.add_argument(
            "--disable-flashinfer",
            action="store_true",
            help="Disable flashinfer inference kernels.",
        )
        parser.add_argument(
            "--disable-radix-cache",
            action="store_true",
            help="Disable RadixAttention for prefix caching.",
        )
        parser.add_argument(
            "--disable-regex-jump-forward",
            action="store_true",
            help="Disable regex jump-forward.",
        )
        parser.add_argument(
            "--disable-cuda-graph",
            action="store_true",
            help="Disable cuda graph.",
        )
        parser.add_argument(
            "--disable-disk-cache",
            action="store_true",
            help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
        )
        parser.add_argument(
            "--enable-torch-compile",
            action="store_true",
            help="Optimize the model with torch.compile, experimental feature.",
        )
        parser.add_argument(
            "--attention-reduce-in-fp32",
            action="store_true",
            # Fixed typo ("intermidiate") and the missing space between the
            # two concatenated sentences.
            help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16. "
            "This only affects Triton attention kernels",
        )
        parser.add_argument(
            "--enable-p2p-check",
            action="store_true",
            help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
        )
        parser.add_argument(
            "--efficient-weight-load",
            action="store_true",
            help="Turn on memory efficient weight loading with quantization (quantize per layer during loading).",
        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        """Build a ServerArgs from a namespace produced by ``add_cli_args``."""
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        return cls(**{attr: getattr(args, attr) for attr in attrs})

    def url(self):
        """Return the base HTTP URL of the server."""
        return f"http://{self.host}:{self.port}"

    def print_mode_args(self):
        """Return a one-line summary of the optimization/debug switches."""
        return (
            f"disable_flashinfer={self.disable_flashinfer}, "
            f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}, "
            f"disable_radix_cache={self.disable_radix_cache}, "
            f"disable_regex_jump_forward={self.disable_regex_jump_forward}, "
            f"disable_disk_cache={self.disable_disk_cache}, "
        )

    def check_server_args(self):
        """Validate cross-field constraints; raises AssertionError on conflict."""
        assert (
            self.tp_size % self.nnodes == 0
        ), "tp_size must be divisible by number of nodes"
        assert not (
            self.dp_size > 1 and self.node_rank is not None
        ), "multi-node data parallel is not supported"

Lianmin Zheng's avatar
Lianmin Zheng committed
375
376
377
378

@dataclasses.dataclass
class PortArgs:
    """The ports used by the server's internal processes."""

    # Port used by the tokenizer process.
    tokenizer_port: int
    # Port used by the controller process.
    controller_port: int
    # Port used by the detokenizer process.
    detokenizer_port: int
    # Ports reserved for NCCL initialization — presumably one per
    # tensor-parallel worker group; confirm against the launcher code.
    nccl_ports: List[int]