Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
4cb5a523
Unverified
Commit
4cb5a523
authored
Oct 05, 2025
by
Liangsheng Yin
Committed by
GitHub
Oct 05, 2025
Browse files
Tiny `skip_sample` adjust (#11225)
parent
85c1f793
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
17 additions
and
18 deletions
+17
-18
python/sglang/srt/managers/schedule_batch.py
python/sglang/srt/managers/schedule_batch.py
+5
-1
python/sglang/srt/managers/tp_worker.py
python/sglang/srt/managers/tp_worker.py
+10
-13
python/sglang/srt/managers/tp_worker_overlap_thread.py
python/sglang/srt/managers/tp_worker_overlap_thread.py
+0
-2
python/sglang/srt/speculative/eagle_worker.py
python/sglang/srt/speculative/eagle_worker.py
+1
-1
python/sglang/srt/speculative/ngram_worker.py
python/sglang/srt/speculative/ngram_worker.py
+1
-1
No files found.
python/sglang/srt/managers/schedule_batch.py
View file @
4cb5a523
...
@@ -663,7 +663,11 @@ class Req:
...
@@ -663,7 +663,11 @@ class Req:
@property
def is_prefill_only(self) -> bool:
    """Check if this request is prefill-only (no token generation needed).

    Returns:
        True when the request asks for zero new tokens AND no speculative
        decoding algorithm is configured; False otherwise.
    """
    # NOTE: when spec is enabled, prefill_only optimizations are disabled,
    # so a request only counts as prefill-only if speculative decoding is off.
    # assumes global_server_args_dict is the module-level server-args mapping
    # populated at startup — TODO confirm against the enclosing module.
    return (
        self.sampling_params.max_new_tokens == 0
        and global_server_args_dict["speculative_algorithm"] is None
    )
def
add_latency
(
self
,
stage
:
RequestStage
):
def
add_latency
(
self
,
stage
:
RequestStage
):
if
self
.
metrics_collector
is
None
:
if
self
.
metrics_collector
is
None
:
...
...
python/sglang/srt/managers/tp_worker.py
View file @
4cb5a523
...
@@ -237,7 +237,7 @@ class TpModelWorker:
...
@@ -237,7 +237,7 @@ class TpModelWorker:
self
,
self
,
model_worker_batch
:
ModelWorkerBatch
,
model_worker_batch
:
ModelWorkerBatch
,
launch_done
:
Optional
[
threading
.
Event
]
=
None
,
launch_done
:
Optional
[
threading
.
Event
]
=
None
,
skip_sample
:
bool
=
False
,
is_verify
:
bool
=
False
,
)
->
ForwardBatchOutput
:
)
->
ForwardBatchOutput
:
# update the consumer index of hicache to the running batch
# update the consumer index of hicache to the running batch
self
.
set_hicache_consumer
(
model_worker_batch
.
hicache_consumer_index
)
self
.
set_hicache_consumer
(
model_worker_batch
.
hicache_consumer_index
)
...
@@ -259,19 +259,16 @@ class TpModelWorker:
...
@@ -259,19 +259,16 @@ class TpModelWorker:
if
launch_done
is
not
None
:
if
launch_done
is
not
None
:
launch_done
.
set
()
launch_done
.
set
()
if
skip_sample
:
skip_sample
=
is_verify
or
model_worker_batch
.
is_prefill_only
next_token_ids
=
None
next_token_ids
=
None
# For prefill-only requests, we still need to compute logprobs even when sampling is skipped
if
(
if
not
skip_sample
:
model_worker_batch
.
is_prefill_only
and
model_worker_batch
.
return_logprob
):
# Compute logprobs without full sampling
self
.
model_runner
.
compute_logprobs_only
(
logits_output
,
model_worker_batch
)
else
:
next_token_ids
=
self
.
model_runner
.
sample
(
logits_output
,
forward_batch
)
next_token_ids
=
self
.
model_runner
.
sample
(
logits_output
,
forward_batch
)
elif
model_worker_batch
.
return_logprob
and
not
is_verify
:
# NOTE: Compute logprobs without full sampling
self
.
model_runner
.
compute_logprobs_only
(
logits_output
,
model_worker_batch
)
return
ForwardBatchOutput
(
return
ForwardBatchOutput
(
logits_output
=
logits_output
,
logits_output
=
logits_output
,
...
...
python/sglang/srt/managers/tp_worker_overlap_thread.py
View file @
4cb5a523
...
@@ -164,8 +164,6 @@ class TpModelWorkerClient:
...
@@ -164,8 +164,6 @@ class TpModelWorkerClient:
forward_batch_output
=
self
.
worker
.
forward_batch_generation
(
forward_batch_output
=
self
.
worker
.
forward_batch_generation
(
model_worker_batch
,
model_worker_batch
,
model_worker_batch
.
launch_done
,
model_worker_batch
.
launch_done
,
# Skip sampling for prefill-only requests
skip_sample
=
model_worker_batch
.
is_prefill_only
,
)
)
logits_output
,
next_token_ids
,
can_run_cuda_graph
=
(
logits_output
,
next_token_ids
,
can_run_cuda_graph
=
(
...
...
python/sglang/srt/speculative/eagle_worker.py
View file @
4cb5a523
...
@@ -823,7 +823,7 @@ class EAGLEWorker(TpModelWorker):
...
@@ -823,7 +823,7 @@ class EAGLEWorker(TpModelWorker):
# Forward
# Forward
forward_batch_output
=
self
.
target_worker
.
forward_batch_generation
(
forward_batch_output
=
self
.
target_worker
.
forward_batch_generation
(
model_worker_batch
,
skip_sample
=
True
model_worker_batch
,
is_verify
=
True
)
)
logits_output
,
can_run_cuda_graph
=
(
logits_output
,
can_run_cuda_graph
=
(
forward_batch_output
.
logits_output
,
forward_batch_output
.
logits_output
,
...
...
python/sglang/srt/speculative/ngram_worker.py
View file @
4cb5a523
...
@@ -214,7 +214,7 @@ class NGRAMWorker:
...
@@ -214,7 +214,7 @@ class NGRAMWorker:
if
model_worker_batch
.
forward_mode
.
is_target_verify
():
if
model_worker_batch
.
forward_mode
.
is_target_verify
():
forward_batch_output
=
self
.
target_worker
.
forward_batch_generation
(
forward_batch_output
=
self
.
target_worker
.
forward_batch_generation
(
model_worker_batch
,
skip_sample
=
True
model_worker_batch
,
is_verify
=
True
)
)
logits_output
,
can_run_cuda_graph
=
(
logits_output
,
can_run_cuda_graph
=
(
forward_batch_output
.
logits_output
,
forward_batch_output
.
logits_output
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment