help="If set, skip init tokenizer and pass input_ids in generate request.",
help="If set, skip init tokenizer and pass input_ids in generate request.",
)
)
parser.add_argument(
    "--enable-tokenizer-batch-encode",
    action="store_true",
    help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
)
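# Illustrative sketch, not part of the diff: a plain store_true flag that
# callers would gate batch tokenization on. The standalone parser below is a
# demo only; the batch call mentioned in the comment assumes a Hugging
# Face-style tokenizer.
import argparse

demo = argparse.ArgumentParser()
demo.add_argument("--enable-tokenizer-batch-encode", action="store_true")
args = demo.parse_args(["--enable-tokenizer-batch-encode"])
assert args.enable_tokenizer_batch_encode is True
# Batch path (assumed HF-style tokenizer): tokenizer(texts)["input_ids"]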
parser.add_argument(
    "--load-format",
    type=str,
...
@@ -603,6 +603,12 @@ class ServerArgs:
action="store_true",
action="store_true",
help="Whether to use a CausalLM as an embedding model.",
help="Whether to use a CausalLM as an embedding model.",
)
)
parser.add_argument(
    "--enable-multimodal",
    default=ServerArgs.enable_multimodal,
    action="store_true",
    help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen.",
)
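# Illustrative sketch, not part of the diff: with action="store_true", an
# explicit default= overrides argparse's implicit False, so the dataclass
# default (here stood in by None) flows through when the flag is omitted.
import argparse

demo = argparse.ArgumentParser()
demo.add_argument("--enable-multimodal", default=None, action="store_true")
assert demo.parse_args([]).enable_multimodal is None  # default preserved
assert demo.parse_args(["--enable-multimodal"]).enable_multimodal is True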
parser.add_argument(
    "--revision",
    type=str,
...
@@ -780,6 +786,33 @@ class ServerArgs:
action="store_true",
action="store_true",
help="Enable log prometheus metrics.",
help="Enable log prometheus metrics.",
)
)
parser.add_argument(
    "--bucket-time-to-first-token",
    type=float,
    nargs="+",
    default=ServerArgs.bucket_time_to_first_token,
    help="The buckets of time to first token, specified as a list of floats.",
)
parser.add_argument(
    "--bucket-inter-token-latency",
    type=float,
    nargs="+",
    default=ServerArgs.bucket_inter_token_latency,
    help="The buckets of inter-token latency, specified as a list of floats.",
)
parser.add_argument(
    "--bucket-e2e-request-latency",
    type=float,
    nargs="+",
    default=ServerArgs.bucket_e2e_request_latency,
    help="The buckets of end-to-end request latency, specified as a list of floats.",
)
help="A dictionary in JSON string format used to override default model configurations.",
help="A dictionary in JSON string format used to override default model configurations.",
default=ServerArgs.json_model_override_args,
default=ServerArgs.json_model_override_args,
)
)
parser.add_argument(
    "--preferred-sampling-params",
    type=str,
    help="JSON-formatted sampling settings that will be returned by /get_model_info.",
)
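# Illustrative sketch, not part of the diff: the value is a raw JSON string,
# so a server could decode it once at startup and echo it from
# /get_model_info; standalone parser for demo only.
import argparse
import json

demo = argparse.ArgumentParser()
demo.add_argument("--preferred-sampling-params", type=str)
args = demo.parse_args(
    ["--preferred-sampling-params", '{"temperature": 0.7, "top_p": 0.9}']
)
assert json.loads(args.preferred_sampling_params) == {"temperature": 0.7, "top_p": 0.9}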
# LoRA
parser.add_argument(
...
@@ -1043,6 +1081,11 @@ class ServerArgs:
action="store_true",
action="store_true",
help="Enable NCCL NVLS for prefill heavy requests when available.",
help="Enable NCCL NVLS for prefill heavy requests when available.",
)
)
parser.add_argument(
    "--enable-tokenizer-batch-encode",
    action="store_true",
    help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
)
parser.add_argument(
    "--disable-outlines-disk-cache",
    action="store_true",
...
@@ -1053,12 +1096,6 @@ class ServerArgs:
action="store_true",
action="store_true",
help="Disable the custom all-reduce kernel and fall back to NCCL.",
help="Disable the custom all-reduce kernel and fall back to NCCL.",
)
)
parser.add_argument(
    "--enable-multimodal",
    default=ServerArgs.enable_multimodal,
    action="store_true",
    help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen.",
)