server_args.py 10.6 KB
Newer Older
Lianmin Zheng's avatar
Lianmin Zheng committed
1
2
"""The arguments of the server."""

Lianmin Zheng's avatar
Lianmin Zheng committed
3
4
import argparse
import dataclasses
5
import random
6
from typing import List, Optional, Union
Lianmin Zheng's avatar
Lianmin Zheng committed
7
8
9
10


@dataclasses.dataclass
class ServerArgs:
    """All configurable arguments of the server.

    Defaults live on the dataclass fields; ``add_cli_args`` mirrors every
    field as a CLI flag and ``from_cli_args`` rebuilds a ``ServerArgs`` from
    the parsed namespace, so the two must stay in sync field-for-field.
    """

    # Model and tokenizer
    model_path: str
    tokenizer_path: Optional[str] = None
    load_format: str = "auto"
    tokenizer_mode: str = "auto"
    chat_template: Optional[str] = None
    # NOTE(review): the matching CLI flag is a bare `store_true` (argparse
    # default False), so args built via the CLI start at False despite this
    # True default — confirm which default is intended.
    trust_remote_code: bool = True
    context_length: Optional[int] = None
    quantization: Optional[str] = None

    # Port
    host: str = "127.0.0.1"
    port: int = 30000
    # Accepts a single int or a list; normalized to a list in __post_init__.
    additional_ports: Optional[Union[List[int], int]] = None

    # Memory and scheduling
    # None means "pick a default based on tp_size" (see __post_init__).
    mem_fraction_static: Optional[float] = None
    max_prefill_tokens: Optional[int] = None
    max_running_requests: Optional[int] = None
    schedule_heuristic: str = "lpm"
    schedule_conservativeness: float = 1.0

    # Other runtime options
    tp_size: int = 1
    stream_interval: int = 8
    # None means "draw a fresh random seed" (see __post_init__).
    random_seed: Optional[int] = None

    # Logging
    log_level: str = "info"
    log_requests: bool = False
    disable_log_stats: bool = False
    log_stats_interval: int = 10
    show_time_cost: bool = False

    # Other
    api_key: str = ""

    # Data parallelism
    dp_size: int = 1
    load_balance_method: str = "round_robin"

    # Optimization/debug options
    enable_flashinfer: bool = False
    attention_reduce_in_fp32: bool = False
    disable_radix_cache: bool = False
    disable_regex_jump_forward: bool = False
    disable_disk_cache: bool = False

    def __post_init__(self):
        """Fill in the defaults that depend on other fields."""
        if self.tokenizer_path is None:
            self.tokenizer_path = self.model_path

        if self.mem_fraction_static is None:
            # Leave more headroom as tensor parallelism grows, since the
            # per-GPU non-static overhead is larger with more ranks.
            if self.tp_size >= 8:
                self.mem_fraction_static = 0.80
            elif self.tp_size >= 4:
                self.mem_fraction_static = 0.82
            elif self.tp_size >= 2:
                self.mem_fraction_static = 0.85
            else:
                self.mem_fraction_static = 0.90

        # Normalize additional_ports to a list.
        if isinstance(self.additional_ports, int):
            self.additional_ports = [self.additional_ports]
        elif self.additional_ports is None:
            self.additional_ports = []

        if self.random_seed is None:
            self.random_seed = random.randint(0, 1 << 30)

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        """Register one CLI flag per ServerArgs field on `parser`."""
        parser.add_argument(
            "--model-path",
            type=str,
            help="The path of the model weights. This can be a local folder or a Hugging Face repo ID.",
            required=True,
        )
        parser.add_argument(
            "--tokenizer-path",
            type=str,
            default=ServerArgs.tokenizer_path,
            help="The path of the tokenizer.",
        )
        parser.add_argument(
            "--host", type=str, default=ServerArgs.host, help="The host of the server."
        )
        parser.add_argument(
            "--port", type=int, default=ServerArgs.port, help="The port of the server."
        )
        parser.add_argument(
            "--additional-ports",
            type=int,
            nargs="*",
            default=[],
            help="Additional ports specified for the server.",
        )
        parser.add_argument(
            "--load-format",
            type=str,
            default=ServerArgs.load_format,
            choices=["auto", "pt", "safetensors", "npcache", "dummy"],
            help="The format of the model weights to load. "
            '"auto" will try to load the weights in the safetensors format '
            "and fall back to the pytorch bin format if safetensors format "
            "is not available. "
            '"pt" will load the weights in the pytorch bin format. '
            '"safetensors" will load the weights in the safetensors format. '
            '"npcache" will load the weights in pytorch format and store '
            "a numpy cache to speed up the loading. "
            '"dummy" will initialize the weights with random values, '
            "which is mainly for profiling.",
        )
        parser.add_argument(
            "--tokenizer-mode",
            type=str,
            default=ServerArgs.tokenizer_mode,
            choices=["auto", "slow"],
            help="Tokenizer mode. 'auto' will use the fast "
            "tokenizer if available, and 'slow' will "
            "always use the slow tokenizer.",
        )
        parser.add_argument(
            "--chat-template",
            type=str,
            default=ServerArgs.chat_template,
            # Fixed typo: "buliltin" -> "builtin".
            help="The builtin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server",
        )
        parser.add_argument(
            "--trust-remote-code",
            action="store_true",
            help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
        )
        parser.add_argument(
            "--context-length",
            type=int,
            default=ServerArgs.context_length,
            help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
        )
        parser.add_argument(
            "--quantization",
            type=str,
            default=ServerArgs.quantization,
            help="The quantization method.",
        )
        parser.add_argument(
            "--mem-fraction-static",
            type=float,
            default=ServerArgs.mem_fraction_static,
            help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
        )
        parser.add_argument(
            "--max-prefill-tokens",
            type=int,
            default=ServerArgs.max_prefill_tokens,
            help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
        )
        parser.add_argument(
            "--max-running-requests",
            type=int,
            default=ServerArgs.max_running_requests,
            help="The maximum number of running requests.",
        )
        parser.add_argument(
            "--schedule-heuristic",
            type=str,
            default=ServerArgs.schedule_heuristic,
            choices=["lpm", "random", "fcfs", "dfs-weight"],
            help="Scheduling Heuristic.",
        )
        parser.add_argument(
            "--schedule-conservativeness",
            type=float,
            default=ServerArgs.schedule_conservativeness,
            help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
        )
        parser.add_argument(
            "--tp-size",
            type=int,
            default=ServerArgs.tp_size,
            help="Tensor parallelism size.",
        )
        parser.add_argument(
            "--stream-interval",
            type=int,
            default=ServerArgs.stream_interval,
            help="The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher",
        )
        parser.add_argument(
            "--random-seed",
            type=int,
            default=ServerArgs.random_seed,
            help="Random seed.",
        )
        parser.add_argument(
            "--log-level",
            type=str,
            default=ServerArgs.log_level,
            help="Logging level",
        )
        parser.add_argument(
            "--log-requests",
            action="store_true",
            help="Log all requests",
        )
        parser.add_argument(
            "--disable-log-stats",
            action="store_true",
            help="Disable logging throughput stats.",
        )
        parser.add_argument(
            "--log-stats-interval",
            type=int,
            default=ServerArgs.log_stats_interval,
            help="Log stats interval in second.",
        )
        parser.add_argument(
            "--show-time-cost",
            action="store_true",
            help="Show time cost of custom marks",
        )
        parser.add_argument(
            "--api-key",
            type=str,
            default=ServerArgs.api_key,
            help="Set API key of the server",
        )

        # Data parallelism
        parser.add_argument(
            "--dp-size",
            type=int,
            default=ServerArgs.dp_size,
            help="Data parallelism size.",
        )
        parser.add_argument(
            "--load-balance-method",
            type=str,
            default=ServerArgs.load_balance_method,
            help="Load balancing strategy for data parallelism.",
            choices=[
                "round_robin",
                "shortest_queue",
            ],
        )

        # Optimization/debug options
        parser.add_argument(
            "--enable-flashinfer",
            action="store_true",
            help="Enable flashinfer inference kernels",
        )
        parser.add_argument(
            "--attention-reduce-in-fp32",
            action="store_true",
            # Fixed typo: "intermidiate" -> "intermediate".
            help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16.",
        )
        parser.add_argument(
            "--disable-radix-cache",
            action="store_true",
            help="Disable RadixAttention",
        )
        parser.add_argument(
            "--disable-regex-jump-forward",
            action="store_true",
            help="Disable regex jump-forward",
        )
        parser.add_argument(
            "--disable-disk-cache",
            action="store_true",
            help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        """Build a ServerArgs from a namespace produced by `add_cli_args`.

        Requires every dataclass field to exist as an attribute on `args`.
        """
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        return cls(**{attr: getattr(args, attr) for attr in attrs})

    def url(self):
        """Return the base HTTP URL of the server."""
        return f"http://{self.host}:{self.port}"

    def print_mode_args(self):
        """Return a one-line summary of the optimization/debug flags."""
        return (
            f"enable_flashinfer={self.enable_flashinfer}, "
            f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}, "
            f"disable_radix_cache={self.disable_radix_cache}, "
            f"disable_regex_jump_forward={self.disable_regex_jump_forward}, "
            f"disable_disk_cache={self.disable_disk_cache}, "
        )

Lianmin Zheng's avatar
Lianmin Zheng committed
299

300
301
302
303
304
305
@dataclasses.dataclass
class ModelPortArgs:
    """Port assignments for one model worker group."""

    # NOTE(review): presumably the port used for NCCL-based distributed
    # initialization of the tensor-parallel group — confirm with the caller.
    nccl_port: int
    # One port per tensor-parallel model process.
    model_tp_ports: List[int]


Lianmin Zheng's avatar
Lianmin Zheng committed
306
307
308
309
310
@dataclasses.dataclass
class PortArgs:
    """Top-level port assignments for the server's subprocesses."""

    # Port for the tokenizer process.
    tokenizer_port: int
    # Port for the router process.
    router_port: int
    # Port for the detokenizer process.
    detokenizer_port: int
    # Per-model-worker port groups (one entry per data-parallel replica;
    # presumably — verify against the code that constructs this).
    model_port_args: List[ModelPortArgs]