## Part 2: This part will be assigned in python/sglang/srt/layers/sampler.py::Sampler
# The log probs of output tokens. If SGLANG_RETURN_ORIGINAL_LOGPROB = True, these are the log probs before applying temperature; if False, the log probs after applying temperature.
next_token_logprobs:Optional[torch.Tensor]=None
next_token_logprobs:Optional[torch.Tensor]=None
# The logprobs and ids of the top-k tokens in output positions. shape: [#seq, k]
help="The ratio of mamba state memory to full kv cache memory.",
help="The ratio of mamba state memory to full kv cache memory.",
)
)
# Args for multi-item-scoring
parser.add_argument(
"--multi-item-scoring-delimiter",
type=int,
default=ServerArgs.multi_item_scoring_delimiter,
help="Delimiter token ID for multi-item scoring. Used to combine Query and Items into a single sequence: Query<delimiter>Item1<delimiter>Item2<delimiter>... This enables efficient batch processing of multiple items against a single query.",
)
# Hierarchical cache
parser.add_argument(
parser.add_argument(
"--enable-hierarchical-cache",
"--enable-hierarchical-cache",
...
@@ -2636,6 +2630,14 @@ class ServerArgs:
...
@@ -2636,6 +2630,14 @@ class ServerArgs:
help="Mode of offloading.",
help="Mode of offloading.",
)
)
# Args for multi-item-scoring
parser.add_argument(
"--multi-item-scoring-delimiter",
type=int,
default=ServerArgs.multi_item_scoring_delimiter,
help="Delimiter token ID for multi-item scoring. Used to combine Query and Items into a single sequence: Query<delimiter>Item1<delimiter>Item2<delimiter>... This enables efficient batch processing of multiple items against a single query.",