help="Additional ports specified for launching server.",
help="Additional ports specified for the server.",
)
parser.add_argument(
"--load-format",
...
...
@@ -112,6 +126,12 @@ class ServerArgs:
action="store_true",
help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
)
parser.add_argument(
"--context-length",
type=int,
default=ServerArgs.context_length,
help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
)
parser.add_argument(
"--mem-fraction-static",
type=float,
...
...
@@ -124,18 +144,6 @@ class ServerArgs:
default=ServerArgs.max_prefill_num_token,
help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
)
parser.add_argument(
"--context-length",
type=int,
default=ServerArgs.context_length,
help="The model's maximum context length. Use this to reduce the context length to save memory. Defaults to None (will use the value from the model's config.json instead).",
)
parser.add_argument(
"--tp-size",
type=int,
default=ServerArgs.tp_size,
help="Tensor parallelism degree.",
)
parser.add_argument(
"--schedule-heuristic",
type=str,
...
...
@@ -149,15 +157,10 @@ class ServerArgs:
help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
)
parser.add_argument(
"--random-seed",
"--tp-size",
type=int,
default=ServerArgs.random_seed,
help="Random seed.",
)
parser.add_argument(
"--attention-reduce-in-fp32",
action="store_true",
help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16.",
default=ServerArgs.tp_size,
help="Tensor parallelism size.",
)
parser.add_argument(
"--stream-interval",
...
...
@@ -165,11 +168,17 @@ class ServerArgs:
default=ServerArgs.stream_interval,
help="The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher",
)
parser.add_argument(
"--random-seed",
type=int,
default=ServerArgs.random_seed,
help="Random seed.",
)
parser.add_argument(
"--log-level",
type=str,
default=ServerArgs.log_level,
help="Log level",
help="Logging level",
)
parser.add_argument(
"--disable-log-stats",
...
...
@@ -182,28 +191,33 @@ class ServerArgs:
default=ServerArgs.log_stats_interval,
help="Log stats interval in seconds.",
)
parser.add_argument(
"--show-time-cost",
action="store_true",
help="Show time cost of custom marks",
)
parser.add_argument(
"--api-key",
type=str,
default=ServerArgs.api_key,
help="Set API Key",
help="Set API key of the server",
)
# Optimization/debug options
parser.add_argument(
"--show-time-cost",
"--enable-flashinfer",
action="store_true",
help="Show time cost of custom marks",
help="Enable flashinfer inference kernels",
)
# optional modes
parser.add_argument(
"--disable-radix-cache",
"--attention-reduce-in-fp32",
action="store_true",
help="Disable RadixAttention",
help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16.",