Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhaoyu6
sglang
Commits
3ddd7dc9
"examples/vscode:/vscode.git/clone" did not exist on "0fbaff6cddb1d68823fc39878b2962460073343c"
Unverified
Commit
3ddd7dc9
authored
Oct 07, 2025
by
Liangsheng Yin
Committed by
GitHub
Oct 07, 2025
Browse files
Introduce future indices (#11301)
parent
501dfa6b
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
25 additions
and
25 deletions
+25
-25
python/sglang/srt/managers/overlap_utils.py
python/sglang/srt/managers/overlap_utils.py
+17
-14
python/sglang/srt/managers/scheduler.py
python/sglang/srt/managers/scheduler.py
+8
-11
No files found.
python/sglang/srt/managers/overlap_utils.py
View file @
3ddd7dc9
from
dataclasses
import
dataclass
from
typing
import
Optional
import
torch
import
torch
from
sglang.srt.managers.schedule_batch
import
ModelWorkerBatch
from
sglang.srt.managers.schedule_batch
import
ModelWorkerBatch
...
@@ -13,6 +16,12 @@ def _resolve_future_token_ids(input_ids, future_token_ids_map):
...
@@ -13,6 +16,12 @@ def _resolve_future_token_ids(input_ids, future_token_ids_map):
)
)
@dataclass
class FutureIndices:
    """Slots reserved in the future-token circular buffer for one batch.

    `indices` holds the absolute buffer positions (one per request in the
    batch); `interval` is the same span expressed as a contiguous slice,
    convenient for vectorized writes into the buffer.
    """

    # Absolute positions in the future token buffer, one per request.
    indices: torch.Tensor
    # Same positions as a contiguous slice; None until/unless known.
    interval: Optional[slice] = None
class
FutureMap
:
class
FutureMap
:
def
__init__
(
def
__init__
(
self
,
self
,
...
@@ -30,23 +39,17 @@ class FutureMap:
...
@@ -30,23 +39,17 @@ class FutureMap:
(
self
.
future_buffer_len
,),
dtype
=
torch
.
int64
,
device
=
self
.
device
(
self
.
future_buffer_len
,),
dtype
=
torch
.
int64
,
device
=
self
.
device
)
)
def alloc_future_indices(self, bs: int) -> FutureIndices:
    """Update the circular buffer pointer and allocate future indices.

    Reserves `bs` consecutive slots starting just past the current
    counter, advances the counter modulo `self.future_limit`, and returns
    the reserved positions as a FutureIndices.

    NOTE(review): the returned indices themselves are not wrapped modulo
    the buffer length — presumably `future_buffer_len` is sized so a
    single batch never crosses the end; confirm against __init__.
    """
    prev_ct = self.future_ct
    # Advance the circular counter for the next allocation.
    self.future_ct = (prev_ct + bs) % self.future_limit
    lo, hi = prev_ct + 1, prev_ct + 1 + bs
    reserved = torch.arange(lo, hi, dtype=torch.int64, device=self.device)
    return FutureIndices(indices=reserved, interval=slice(lo, hi))
def resolve_future(self, model_worker_batch: ModelWorkerBatch):
    """Replace future-token placeholders in the batch's input_ids.

    Delegates to the module-level `_resolve_future_token_ids` helper,
    which rewrites `model_worker_batch.input_ids` in place using the
    stored `token_ids_buf`.
    """
    _resolve_future_token_ids(model_worker_batch.input_ids, self.token_ids_buf)
def store_to_map(self, future_indices: FutureIndices, next_token_ids: torch.Tensor):
    """Publish sampled token ids into the slots previously reserved.

    Writes `next_token_ids` into `token_ids_buf` over the contiguous
    `future_indices.interval` reserved by `alloc_future_indices`, making
    them visible to later `resolve_future` calls.
    """
    # Single sliced assignment — the interval is contiguous by construction.
    self.token_ids_buf[future_indices.interval] = next_token_ids
python/sglang/srt/managers/scheduler.py
View file @
3ddd7dc9
...
@@ -114,7 +114,7 @@ from sglang.srt.managers.io_struct import (
...
@@ -114,7 +114,7 @@ from sglang.srt.managers.io_struct import (
UpdateWeightsFromTensorReqInput
,
UpdateWeightsFromTensorReqInput
,
)
)
from
sglang.srt.managers.mm_utils
import
init_embedding_cache
from
sglang.srt.managers.mm_utils
import
init_embedding_cache
from
sglang.srt.managers.overlap_utils
import
FutureMap
from
sglang.srt.managers.overlap_utils
import
FutureIndices
,
FutureMap
from
sglang.srt.managers.schedule_batch
import
(
from
sglang.srt.managers.schedule_batch
import
(
FINISH_ABORT
,
FINISH_ABORT
,
ModelWorkerBatch
,
ModelWorkerBatch
,
...
@@ -217,7 +217,7 @@ class GenerationBatchResult:
...
@@ -217,7 +217,7 @@ class GenerationBatchResult:
copy_done
:
Optional
[
torch
.
cuda
.
Event
]
=
None
copy_done
:
Optional
[
torch
.
cuda
.
Event
]
=
None
delay_sample_launch
:
bool
=
False
delay_sample_launch
:
bool
=
False
forward_batch
:
Optional
[
ForwardBatch
]
=
None
forward_batch
:
Optional
[
ForwardBatch
]
=
None
future_
map_ct
:
Optional
[
int
]
=
None
future_
indices
:
Optional
[
FutureIndices
]
=
None
def
copy_to_cpu
(
self
,
return_logprob
:
bool
=
False
):
def
copy_to_cpu
(
self
,
return_logprob
:
bool
=
False
):
"""Copy tensors to CPU in overlap scheduling.
"""Copy tensors to CPU in overlap scheduling.
...
@@ -2092,7 +2092,7 @@ class Scheduler(
...
@@ -2092,7 +2092,7 @@ class Scheduler(
)
)
bs
=
len
(
model_worker_batch
.
seq_lens
)
bs
=
len
(
model_worker_batch
.
seq_lens
)
cur_
future_
map_ct
=
self
.
future_map
.
update_ct
(
bs
)
future_
indices
=
self
.
future_map
.
alloc_future_indices
(
bs
)
with
self
.
forward_stream_ctx
:
with
self
.
forward_stream_ctx
:
self
.
forward_stream
.
wait_stream
(
self
.
default_stream
)
self
.
forward_stream
.
wait_stream
(
self
.
default_stream
)
...
@@ -2108,22 +2108,19 @@ class Scheduler(
...
@@ -2108,22 +2108,19 @@ class Scheduler(
).
Event
()
).
Event
()
if
not
model_worker_batch
.
delay_sample_launch
:
if
not
model_worker_batch
.
delay_sample_launch
:
self
.
future_map
.
store_to_map
(
self
.
future_map
.
store_to_map
(
cur_
future_
map_ct
,
b
s
,
batch_result
.
next_token_ids
future_
indice
s
,
batch_result
.
next_token_ids
)
)
batch_result
.
copy_to_cpu
()
batch_result
.
copy_to_cpu
()
else
:
else
:
batch_result
.
future_
map_ct
=
cur_future_map_ct
batch_result
.
future_
indices
=
future_indices
# FIXME(lsyin): move this assignment elsewhere
# FIXME(lsyin): move this assignment elsewhere
maybe_future_next_token_ids
=
self
.
future_map
.
update_next_future
(
maybe_future_next_token_ids
=
-
future_indices
.
indices
cur_future_map_ct
,
bs
)
else
:
else
:
batch_result
=
self
.
model_worker
.
forward_batch_generation
(
batch_result
=
self
.
model_worker
.
forward_batch_generation
(
batch_or_worker_batch
batch_or_worker_batch
)
)
maybe_future_next_token_ids
=
batch_result
.
next_token_ids
maybe_future_next_token_ids
=
batch_result
.
next_token_ids
copy_done
=
None
if
not
self
.
spec_algorithm
.
is_none
():
if
not
self
.
spec_algorithm
.
is_none
():
# TODO(lsyin): unify this metric-updating logic with non-spec, and move it to decode processing
# TODO(lsyin): unify this metric-updating logic with non-spec, and move it to decode processing
...
@@ -2182,8 +2179,8 @@ class Scheduler(
...
@@ -2182,8 +2179,8 @@ class Scheduler(
tmp_result
.
logits_output
,
tmp_result
.
logits_output
,
tmp_result
.
forward_batch
,
tmp_result
.
forward_batch
,
)
)
ct
,
b
s
=
tmp_result
.
future_
map_ct
,
len
(
tmp_batch
.
reqs
)
future_indice
s
=
tmp_result
.
future_
indices
self
.
future_map
.
store_to_map
(
ct
,
b
s
,
tmp_result
.
next_token_ids
)
self
.
future_map
.
store_to_map
(
future_indice
s
,
tmp_result
.
next_token_ids
)
tmp_result
.
copy_to_cpu
()
tmp_result
.
copy_to_cpu
()
self
.
result_queue
.
appendleft
((
tmp_batch
,
tmp_result
))
self
.
result_queue
.
appendleft
((
tmp_batch
,
tmp_result
))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment