help="Whether to use a CausalLM as an embedding model.",
)
# Memory and scheduling
parser.add_argument(
"--mem-fraction-static",
type=float,
...
...
@@ -368,6 +372,8 @@ class ServerArgs:
default=ServerArgs.schedule_conservativeness,
help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
help=r"Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
)
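
For reference, a quick stdlib check (nothing sglang-specific) of what the example pattern from the help text accepts:

import re

pattern = r"[\n\t ]*"                        # the example from the help text
assert re.fullmatch(pattern, "\n\t  ")       # consecutive whitespaces allowed
assert re.fullmatch(pattern, "")             # the empty string also matches
assert re.fullmatch(pattern, " x ") is None  # non-whitespace is rejected
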
parser.add_argument(
"--watchdog-timeout",
type=float,
default=ServerArgs.watchdog_timeout,
help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
)
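
As a rough illustration of the behavior this flag configures, here is a minimal sketch of the watchdog pattern; the class and method names are hypothetical, not sglang's actual implementation:

import os
import threading
import time

class Watchdog:  # hypothetical helper, for illustration only
    def __init__(self, timeout: float):
        self.timeout = timeout
        self.last_heartbeat = time.monotonic()
        threading.Thread(target=self._watch, daemon=True).start()

    def heartbeat(self):
        # Called after every forward batch completes.
        self.last_heartbeat = time.monotonic()

    def _watch(self):
        while True:
            time.sleep(self.timeout / 2)
            if time.monotonic() - self.last_heartbeat > self.timeout:
                # Crash the whole process rather than hang forever.
                os._exit(1)
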
# Logging
parser.add_argument(
"--log-level",
type=str,
...
...
@@ -420,7 +434,14 @@ class ServerArgs:
action="store_true",
help="Enable log prometheus metrics.",
)
parser.add_argument(
"--decode-log-interval",
type=int,
default=ServerArgs.decode_log_interval,
help="The log interval of decode batch",
)
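
A hypothetical sketch of how an interval flag like this typically gates per-step logging in a decode loop (illustrative only; run_decode_batch and the batch object are made up):

import logging

logger = logging.getLogger(__name__)

def decode_loop(args, batch, num_steps):
    # Hypothetical decode loop, for illustration of the flag's effect.
    for step in range(num_steps):
        batch.run_decode_batch()
        if step % args.decode_log_interval == 0:
            logger.info("decode step %d, batch size %d", step, len(batch))
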
# API related
parser.add_argument(
"--api-key",
type=str,
...
...
@@ -438,18 +459,6 @@ class ServerArgs:
action="store_true",
help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
)
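
With the flag enabled, an OpenAI-compatible response carries the cached-token count in its usage block; the values below are illustrative, with the field layout following the OpenAI usage schema:

usage = {
    "prompt_tokens": 128,
    "completion_tokens": 32,
    "prompt_tokens_details": {"cached_tokens": 96},  # filled in by this flag
}
print(usage["prompt_tokens_details"]["cached_tokens"])  # 96
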
parser.add_argument(
"--watchdog-timeout",
type=float,
default=ServerArgs.watchdog_timeout,
help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
)
parser.add_argument(
"--decode-log-interval",
type=int,
default=ServerArgs.decode_log_interval,
help="The log interval of decode batch",
)
# Data parallelism
parser.add_argument(
...
...
@@ -470,7 +479,7 @@ class ServerArgs:
],
)
# Multi-node distributed serving args
# Multi-node distributed serving
parser.add_argument(
"--dist-init-addr",
"--nccl-init-addr",# For backward compatbility. This will be removed in the future.
...
...
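
Keeping --nccl-init-addr as a second option string (see the hunk above) works because argparse maps every alias to the destination derived from the first long option; a minimal demonstration:

import argparse

p = argparse.ArgumentParser()
p.add_argument("--dist-init-addr", "--nccl-init-addr", type=str)
args = p.parse_args(["--nccl-init-addr", "1.2.3.4:5000"])
assert args.dist_init_addr == "1.2.3.4:5000"  # deprecated alias still works
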
@@ -677,6 +686,12 @@ class ServerArgs:
"This can potentially increase throughput but may also increase time-to-first-token latency. "
"The default value is 1, meaning only run one decoding step at a time.",
)
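
The trade-off described in the help text can be sketched as follows (hypothetical scheduler API, not the actual sglang code): new requests can only join the running batch between groups of steps, so larger values raise throughput at the cost of time-to-first-token.

def event_loop(scheduler, num_continuous_decode_steps=1):
    while scheduler.has_work():             # hypothetical scheduler API
        batch = scheduler.get_next_batch()  # new requests admitted here
        for _ in range(num_continuous_decode_steps):
            batch.forward_decode_step()     # no scheduling pass in between
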
parser.add_argument(
"--delete-ckpt-after-loading",
default=ServerArgs.delete_ckpt_after_loading,
action="store_true",
help="Delete the model checkpoint after loading the model.",