help="The data type of the SSM states in mamba cache.",
)
# Args for multi-item-scoring
# Registers the --multi-item-scoring-delimiter CLI option. The value is parsed
# as an int (a token ID, per the help text) and falls back to the default
# declared on ServerArgs.multi_item_scoring_delimiter when the flag is omitted.
# NOTE(review): a non-None value is what triggers the multi-item scoring checks
# in ServerArgs (radix cache disabled, chunked prefill size == -1) -- confirm
# those checks run on the same parsed arguments.
parser.add_argument(
"--multi-item-scoring-delimiter",
type=int,
default=ServerArgs.multi_item_scoring_delimiter,
help="Delimiter token ID for multi-item scoring. Used to combine Query and Items into a single sequence: Query<delimiter>Item1<delimiter>Item2<delimiter>... This enables efficient batch processing of multiple items against a single query.",
)
# Hierarchical cache
parser.add_argument(
"--enable-hierarchical-cache",
...
...
@@ -3004,6 +3016,17 @@ class ServerArgs:
"lof",
],f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."
# Multi-item scoring validation: once a delimiter token ID is configured, both
# the radix cache and chunked prefill have to be switched off.
multi_item_scoring_enabled = self.multi_item_scoring_delimiter is not None
if multi_item_scoring_enabled:
    # Evaluated first, so a radix-cache misconfiguration is reported before
    # a chunked-prefill one.
    assert self.disable_radix_cache, (
        "Multi-item scoring requires radix cache to be disabled. "
        "Please set --disable-radix-cache when using --multi-item-scoring-delimiter."
    )
    # -1 is the value the message below tells users to pass in order to
    # disable chunked prefill.
    assert self.chunked_prefill_size == -1, (
        "Multi-item scoring requires chunked prefill to be disabled. "
        "Please set --chunked-prefill-size -1 when using --multi-item-scoring-delimiter."
    )
defcheck_lora_server_args(self):
assertself.max_loras_per_batch>0,"max_loras_per_batch must be positive"