"""The arguments of the server."""

import argparse
import dataclasses
from typing import List, Optional, Union


@dataclasses.dataclass
class ServerArgs:
    # Model and tokenizer
    model_path: str
    tokenizer_path: Optional[str] = None
    load_format: str = "auto"
    tokenizer_mode: str = "auto"
    chat_template: Optional[str] = None
    trust_remote_code: bool = True
    context_length: Optional[int] = None
    quantization: Optional[str] = None

    # Port
    host: str = "127.0.0.1"
    port: int = 30000
    additional_ports: Optional[Union[List[int], int]] = None

    # Memory and scheduling
    mem_fraction_static: Optional[float] = None
    max_prefill_num_token: Optional[int] = None
    schedule_heuristic: str = "lpm"
    schedule_conservativeness: float = 1.0

    # Other runtime options
    tp_size: int = 1
    stream_interval: int = 8
    random_seed: int = 42

    # Logging
    log_level: str = "info"
    log_requests: bool = False
    disable_log_stats: bool = False
    log_stats_interval: int = 10
    show_time_cost: bool = False

    # Other
    api_key: str = ""

    # Optimization/debug options
    enable_flashinfer: bool = False
    attention_reduce_in_fp32: bool = False
    disable_radix_cache: bool = False
    disable_regex_jump_forward: bool = False
    disable_disk_cache: bool = False

    def __post_init__(self):
        if self.tokenizer_path is None:
            self.tokenizer_path = self.model_path
        # Pick a default static memory fraction based on the tensor parallelism size.
        if self.mem_fraction_static is None:
            if self.tp_size >= 8:
                self.mem_fraction_static = 0.80
            elif self.tp_size >= 4:
                self.mem_fraction_static = 0.82
            elif self.tp_size >= 2:
                self.mem_fraction_static = 0.85
            else:
                self.mem_fraction_static = 0.90
        # Normalize additional_ports into a list.
        if isinstance(self.additional_ports, int):
            self.additional_ports = [self.additional_ports]
        elif self.additional_ports is None:
            self.additional_ports = []

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        parser.add_argument(
            "--model-path",
            type=str,
            help="The path of the model weights. This can be a local folder or a Hugging Face repo ID.",
            required=True,
        )
        parser.add_argument(
            "--tokenizer-path",
            type=str,
            default=ServerArgs.tokenizer_path,
            help="The path of the tokenizer.",
        )
        parser.add_argument(
            "--host", type=str, default=ServerArgs.host, help="The host of the server."
        )
        parser.add_argument(
            "--port", type=int, default=ServerArgs.port, help="The port of the server."
        )
        parser.add_argument(
            "--additional-ports",
            type=int,
            nargs="*",
            default=[],
            help="Additional ports specified for the server.",
        )
        parser.add_argument(
            "--load-format",
            type=str,
            default=ServerArgs.load_format,
            choices=["auto", "pt", "safetensors", "npcache", "dummy"],
            help="The format of the model weights to load. "
            '"auto" will try to load the weights in the safetensors format '
            "and fall back to the pytorch bin format if safetensors format "
            "is not available. "
            '"pt" will load the weights in the pytorch bin format. '
            '"safetensors" will load the weights in the safetensors format. '
            '"npcache" will load the weights in pytorch format and store '
            "a numpy cache to speed up the loading. "
            '"dummy" will initialize the weights with random values, '
            "which is mainly for profiling.",
        )
        parser.add_argument(
            "--tokenizer-mode",
            type=str,
            default=ServerArgs.tokenizer_mode,
            choices=["auto", "slow"],
            help="Tokenizer mode. 'auto' will use the fast "
            "tokenizer if available, and 'slow' will "
            "always use the slow tokenizer.",
        )
        parser.add_argument(
            "--chat-template",
            type=str,
            default=ServerArgs.chat_template,
            help="The builtin chat template name or the path of the chat template file. This is only used for the OpenAI-compatible API server.",
        )
        parser.add_argument(
            "--trust-remote-code",
            action="store_true",
            help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
        )
        parser.add_argument(
            "--context-length",
            type=int,
            default=ServerArgs.context_length,
            help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
        )
        parser.add_argument(
            "--quantization",
            type=str,
            default=ServerArgs.quantization,
            help="The quantization method.",
        )
        parser.add_argument(
            "--mem-fraction-static",
            type=float,
            default=ServerArgs.mem_fraction_static,
            help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
        )
        parser.add_argument(
            "--max-prefill-num-token",
            type=int,
            default=ServerArgs.max_prefill_num_token,
            help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
        )
        parser.add_argument(
            "--schedule-heuristic",
            type=str,
            default=ServerArgs.schedule_heuristic,
            choices=["lpm", "random", "fcfs", "dfs-weight"],
            help="The scheduling heuristic.",
        )
        parser.add_argument(
            "--schedule-conservativeness",
            type=float,
            default=ServerArgs.schedule_conservativeness,
            help="How conservative the scheduling policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
        )
        parser.add_argument(
            "--tp-size",
            type=int,
            default=ServerArgs.tp_size,
            help="Tensor parallelism size.",
        )
        parser.add_argument(
            "--stream-interval",
            type=int,
            default=ServerArgs.stream_interval,
            help="The interval (or buffer size) for streaming, in terms of token length. A smaller value makes streaming smoother, while a larger value gives higher throughput.",
        )
        parser.add_argument(
            "--random-seed",
            type=int,
            default=ServerArgs.random_seed,
            help="Random seed.",
        )
        parser.add_argument(
            "--log-level",
            type=str,
            default=ServerArgs.log_level,
            help="The logging level.",
        )
        parser.add_argument(
            "--log-requests",
            action="store_true",
            help="Log all requests",
        )
        parser.add_argument(
            "--disable-log-stats",
            action="store_true",
            help="Disable logging throughput stats.",
        )
        parser.add_argument(
            "--log-stats-interval",
            type=int,
            default=ServerArgs.log_stats_interval,
            help="Log stats interval in second.",
        )
        parser.add_argument(
            "--show-time-cost",
            action="store_true",
            help="Show time cost of custom marks",
        )
        parser.add_argument(
            "--api-key",
            type=str,
            default=ServerArgs.api_key,
            help="Set the API key of the server.",
        )

        # Optimization/debug options
        parser.add_argument(
            "--enable-flashinfer",
            action="store_true",
            help="Enable FlashInfer inference kernels.",
        )
        parser.add_argument(
            "--attention-reduce-in-fp32",
            action="store_true",
            help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16.",
        )
        parser.add_argument(
            "--disable-radix-cache",
            action="store_true",
            help="Disable RadixAttention.",
        )
        parser.add_argument(
            "--disable-regex-jump-forward",
            action="store_true",
            help="Disable the regex jump-forward optimization.",
        )
        parser.add_argument(
            "--disable-disk-cache",
            action="store_true",
            help="Disable the disk cache to avoid possible crashes related to the file system or high concurrency.",
        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        return cls(**{attr: getattr(args, attr) for attr in attrs})

    def url(self):
        return f"http://{self.host}:{self.port}"

    def print_mode_args(self):
        return (
            f"enable_flashinfer={self.enable_flashinfer}, "
            f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}, "
            f"disable_radix_cache={self.disable_radix_cache}, "
            f"disable_regex_jump_forward={self.disable_regex_jump_forward}, "
            f"disable_disk_cache={self.disable_disk_cache}, "
        )


@dataclasses.dataclass
class PortArgs:
    tokenizer_port: int
    router_port: int
    detokenizer_port: int
    nccl_port: int
    model_rpc_ports: List[int]
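

# Minimal usage sketch (illustrative only, not part of the upstream module). It
# shows how ServerArgs is typically wired to argparse via add_cli_args() and
# from_cli_args(). The model path and tensor parallelism size below are
# placeholder values chosen for demonstration.
if __name__ == "__main__":
    example_parser = argparse.ArgumentParser(description="SGLang server arguments")
    ServerArgs.add_cli_args(example_parser)
    # Parse a hypothetical command line instead of sys.argv for demonstration.
    example_args = example_parser.parse_args(
        ["--model-path", "my-org/my-model", "--tp-size", "2"]
    )
    server_args = ServerArgs.from_cli_args(example_args)
    print(server_args.url())                 # http://127.0.0.1:30000
    print(server_args.mem_fraction_static)   # 0.85 (default chosen for tp_size >= 2)
    print(server_args.print_mode_args())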