Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
xdb4_94051
vllm
Commits
7a7929ab
Unverified
Commit
7a7929ab
authored
Mar 30, 2023
by
Woosuk Kwon
Committed by
GitHub
Mar 30, 2023
Browse files
Implement preemption via recomputation & Refactor scheduling logic (#12)
parent
88c0268a
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
279 additions
and
126 deletions
+279
-126
cacheflow/http_frontend/fastapi_frontend.py
cacheflow/http_frontend/fastapi_frontend.py
+2
-1
cacheflow/master/block_manager.py
cacheflow/master/block_manager.py
+2
-1
cacheflow/master/policy.py
cacheflow/master/policy.py
+45
-0
cacheflow/master/scheduler.py
cacheflow/master/scheduler.py
+221
-120
cacheflow/master/server.py
cacheflow/master/server.py
+2
-1
cacheflow/master/simple_frontend.py
cacheflow/master/simple_frontend.py
+3
-1
cacheflow/sequence.py
cacheflow/sequence.py
+4
-2
No files found.
cacheflow/http_frontend/fastapi_frontend.py
View file @
7a7929ab
...
...
@@ -84,8 +84,9 @@ class FastAPIFrontend:
seq
=
Sequence
(
seq_id
,
token_ids
,
block_size
=
self
.
block_size
)
seqs
.
append
(
seq
)
arrival_time
=
time
.
time
()
group_id
=
next
(
self
.
seq_group_counter
)
seq_group
=
SequenceGroup
(
group_id
,
seqs
)
seq_group
=
SequenceGroup
(
group_id
,
seqs
,
arrival_time
)
group_event
=
asyncio
.
Event
()
self
.
sequence_group_events
[
group_id
]
=
group_event
await
self
.
server
.
add_sequence_groups
.
remote
([(
seq_group
,
sampling_params
)])
...
...
cacheflow/master/block_manager.py
View file @
7a7929ab
...
...
@@ -76,7 +76,8 @@ class BlockSpaceManager:
self
.
block_tables
:
Dict
[
int
,
BlockTable
]
=
{}
def
can_allocate
(
self
,
seq_group
:
SequenceGroup
)
->
bool
:
# NOTE: Here we assume that all sequences in the group have the same prompt.
# FIXME(woosuk): Here we assume that all sequences in the group share
# the same prompt. This may not be true for preempted sequences.
seq
=
seq_group
.
seqs
[
0
]
num_required_blocks
=
len
(
seq
.
logical_token_blocks
)
num_free_gpu_blocks
=
self
.
gpu_allocator
.
get_num_free_blocks
()
...
...
cacheflow/master/policy.py
0 → 100644
View file @
7a7929ab
from
typing
import
List
from
cacheflow.sequence
import
SequenceGroup
class Policy:
    """Base class for scheduling policies that rank sequence groups.

    Subclasses supply `get_priority`; `sort_by_priority` uses it to order
    a list of sequence groups.
    """

    def get_priority(
        self,
        now: float,
        seq_group: SequenceGroup,
    ) -> float:
        """Return the priority of `seq_group` at time `now`.

        Larger values sort earlier in `sort_by_priority`.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def sort_by_priority(
        self,
        now: float,
        seq_groups: List[SequenceGroup],
    ) -> List[SequenceGroup]:
        """Return a new list of `seq_groups`, highest priority first.

        The input list is left untouched. Every priority is evaluated
        against the same `now` value.
        """

        def priority_of(group: SequenceGroup) -> float:
            return self.get_priority(now, group)

        # Sort a copy so the caller's list keeps its order.
        ranked = list(seq_groups)
        ranked.sort(key=priority_of, reverse=True)
        return ranked
class FCFS(Policy):
    """First-come-first-served policy: earlier arrivals rank higher."""

    def get_priority(
        self,
        now: float,
        seq_group: SequenceGroup,
    ) -> float:
        """Return how long `seq_group` has existed since its arrival time.

        The value is `now - seq_group.arrival_time`, so a group that
        arrived earlier gets a larger priority.
        """
        waited = now - seq_group.arrival_time
        return waited
class PolicyFactory:
    """Builds scheduling policies from their registered names."""

    # Maps a policy name to the Policy subclass that implements it.
    _POLICY_REGISTRY = {
        'fcfs': FCFS,
    }

    @classmethod
    def get_policy(cls, policy_name: str, **kwargs) -> Policy:
        """Instantiate the policy registered under `policy_name`.

        Args:
            policy_name: Key into the policy registry (e.g. 'fcfs').
            **kwargs: Forwarded unchanged to the policy constructor.

        Returns:
            A new instance of the registered policy class.

        Raises:
            KeyError: If `policy_name` is not registered. The message
                lists the names that are available.
        """
        try:
            policy_cls = cls._POLICY_REGISTRY[policy_name]
        except KeyError:
            # Keep the KeyError type callers may already catch, but say
            # which name was rejected and what the valid choices are.
            available = ', '.join(sorted(cls._POLICY_REGISTRY))
            raise KeyError(
                f'Unknown scheduling policy {policy_name!r}; '
                f'available policies: {available}') from None
        return policy_cls(**kwargs)
cacheflow/master/scheduler.py
View file @
7a7929ab
from
typing
import
Dict
,
List
,
Tuple
import
enum
import
time
from
typing
import
Dict
,
List
,
Optional
,
Tuple
from
cacheflow.master.block_manager
import
BlockSpaceManager
from
cacheflow.master.policy
import
PolicyFactory
from
cacheflow.sampling_params
import
SamplingParams
from
cacheflow.sequence
import
Sequence
from
cacheflow.sequence
import
SequenceGroup
...
...
@@ -9,6 +12,19 @@ from cacheflow.sequence import SequenceOutputs
from
cacheflow.sequence
import
SequenceStatus
class PreemptionMode(enum.Enum):
    """Ways a preempted sequence group can give up its blocks.

    SWAP: the blocks of the preempted sequences are swapped out to CPU
        memory and swapped back in when the sequences are resumed.
    RECOMPUTE: the blocks of the preempted sequences are discarded and
        recomputed when the sequences are resumed, treating the sequences
        as new prompts.
    """

    # Move blocks to CPU memory and bring them back on resume.
    SWAP = enum.auto()
    # Drop blocks and rebuild them on resume.
    RECOMPUTE = enum.auto()
class
Scheduler
:
def
__init__
(
...
...
@@ -25,6 +41,8 @@ class Scheduler:
self
.
num_cpu_blocks
=
num_cpu_blocks
self
.
max_num_batched_tokens
=
max_num_batched_tokens
# Instantiate the scheduling policy.
self
.
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
'fcfs'
)
# Create the block space manager.
self
.
block_manager
=
BlockSpaceManager
(
block_size
=
block_size
,
...
...
@@ -32,158 +50,140 @@ class Scheduler:
num_cpu_blocks
=
num_cpu_blocks
,
)
# Running sequence groups (FIFO).
# Sequence groups in the WAITING state.
self
.
waiting
:
List
[
SequenceGroup
]
=
[]
# Sequence groups in the RUNNING state.
self
.
running
:
List
[
SequenceGroup
]
=
[]
# Mapping: group_id -> num_steps.
self
.
num_steps
:
Dict
[
int
,
int
]
=
{}
# Mapping: group_id -> sampling params.
self
.
sampling_params
:
Dict
[
int
,
SamplingParams
]
=
{}
# Swapped sequence groups (LIFO).
# Sequence groups in the SWAPPED state.
self
.
swapped
:
List
[
SequenceGroup
]
=
[]
# Pending sequence groups (FIFO).
self
.
pending
:
List
[
SequenceGroup
]
=
[]
def
add_sequence_groups
(
self
,
seq
uence
_groups
:
List
[
Tuple
[
SequenceGroup
,
SamplingParams
]],
seq_groups
:
List
[
Tuple
[
SequenceGroup
,
SamplingParams
]],
)
->
None
:
# Add sequence groups to the
pend
ing queue.
for
seq_group
,
sampling_params
in
seq
uence
_groups
:
self
.
pend
ing
.
append
(
seq_group
)
# Add sequence groups to the
wait
ing queue.
for
seq_group
,
sampling_params
in
seq_groups
:
self
.
wait
ing
.
append
(
seq_group
)
self
.
sampling_params
[
seq_group
.
group_id
]
=
sampling_params
def
_free_seq
(
self
,
seq
:
Sequence
)
->
None
:
seq
.
status
=
SequenceStatus
.
FINISHED
self
.
block_manager
.
free
(
seq
)
def
_allocate
(
self
,
seq_group
:
SequenceGroup
)
->
None
:
self
.
block_manager
.
allocate
(
seq_group
)
for
seq
in
seq_group
.
seqs
:
seq
.
status
=
SequenceStatus
.
RUNNING
self
.
running
.
append
(
seq_group
)
# FIXME(woosuk): Support interactive generation.
self
.
num_steps
[
seq_group
.
group_id
]
=
0
def
_append
(
def
_schedule
(
self
,
seq_group
:
SequenceGroup
,
blocks_to_copy
:
Dict
[
int
,
List
[
int
]],
)
->
None
:
for
seq
in
seq_group
.
seqs
:
if
seq
.
status
==
SequenceStatus
.
FINISHED
:
continue
ret
=
self
.
block_manager
.
append
(
seq
)
if
ret
is
not
None
:
src_block
,
dst_block
=
ret
if
src_block
in
blocks_to_copy
:
blocks_to_copy
[
src_block
].
append
(
dst_block
)
else
:
blocks_to_copy
[
src_block
]
=
[
dst_block
]
def
_swap_in
(
self
,
seq_group
:
SequenceGroup
,
blocks_to_swap_in
:
Dict
[
int
,
int
],
)
->
None
:
mapping
=
self
.
block_manager
.
swap_in
(
seq_group
)
blocks_to_swap_in
.
update
(
mapping
)
for
seq
in
seq_group
.
get_seqs
(
status
=
SequenceStatus
.
SWAPPED
):
seq
.
status
=
SequenceStatus
.
RUNNING
self
.
running
.
append
(
seq_group
)
def
_swap_out
(
self
,
seq_group
:
SequenceGroup
,
blocks_to_swap_out
:
Dict
[
int
,
int
],
)
->
None
:
assert
self
.
block_manager
.
can_swap_out
(
seq_group
)
mapping
=
self
.
block_manager
.
swap_out
(
seq_group
)
blocks_to_swap_out
.
update
(
mapping
)
for
seq
in
seq_group
.
get_seqs
(
status
=
SequenceStatus
.
RUNNING
):
seq
.
status
=
SequenceStatus
.
SWAPPED
self
.
swapped
.
append
(
seq_group
)
def
step
(
self
)
->
List
[
SequenceGroup
]:
)
->
Tuple
[
Dict
[
int
,
int
],
Dict
[
int
,
int
],
Dict
[
int
,
List
[
int
]],
List
[
int
]]:
# Blocks that need to be swapped or copied before model execution.
blocks_to_swap_in
:
Dict
[
int
,
int
]
=
{}
blocks_to_swap_out
:
Dict
[
int
,
int
]
=
{}
blocks_to_copy
:
Dict
[
int
,
List
[
int
]]
=
{}
# 1. Reserve new slots for the running sequences.
# NOTE: Here we implicitly assume FCFS scheduling.
# That is, the most recently added sequence group is the first
# to be swapped out.
victim_idx
=
len
(
self
.
running
)
-
1
for
i
,
seq_group
in
enumerate
(
self
.
running
):
if
i
>
victim_idx
:
# The i-th sequence group has already been swapped out.
break
# OOM. Swap out the victim sequence groups.
# Fix the current time.
now
=
time
.
time
()
# NOTE(woosuk): We prioritize the sequence groups in the RUNNING state
# in order to minimize the preemption overheads.
# Preemption happens only when there is no available slot to keep all
# the sequence groups in the RUNNING state.
# In this case, the policy is responsible for deciding which sequence
# groups to preempt.
self
.
running
=
self
.
policy
.
sort_by_priority
(
now
,
self
.
running
)
# Reserve new token slots for the running sequence groups.
running
:
List
[
SequenceGroup
]
=
[]
preempted
:
List
[
SequenceGroup
]
=
[]
while
self
.
running
:
seq_group
=
self
.
running
.
pop
(
0
)
while
not
self
.
block_manager
.
can_append
(
seq_group
):
victim_seq_group
=
self
.
running
[
victim_idx
]
self
.
_swap_out
(
victim_seq_group
,
blocks_to_swap_out
)
victim_idx
-=
1
if
i
>
victim_idx
:
# No other sequence groups can be swapped out.
if
self
.
running
:
# Preempt the lowest-priority sequence groups.
victim_seq_group
=
self
.
running
.
pop
(
-
1
)
self
.
_preempt
(
victim_seq_group
,
blocks_to_swap_out
)
preempted
.
append
(
victim_seq_group
)
else
:
# No other sequence groups can be preempted.
# Preempt the current sequence group.
self
.
_preempt
(
seq_group
,
blocks_to_swap_out
)
preempted
.
append
(
seq_group
)
break
else
:
# Append new slots to the sequence group.
self
.
_append
(
seq_group
,
blocks_to_copy
)
self
.
running
=
self
.
running
[:
victim_idx
+
1
]
# 2. Swap in the swapped sequences if possible.
#
NOTE: Here we implicitly assume FCFS scheduling
.
# The
swapped
sequences are in LIFO order.
for
i
,
seq_group
in
enumerate
(
reversed
(
self
.
swapped
))
:
if
self
.
block_manager
.
can_swap_in
(
seq_group
):
self
.
_swap_in
(
seq_group
,
blocks_to_swap_in
)
self
.
_append
(
seq_group
,
blocks_to_copy
)
else
:
# OOM. Stop swapping
.
self
.
swapped
=
self
.
swapped
[:
len
(
self
.
swapped
)
-
i
]
running
.
append
(
seq_group
)
self
.
running
=
running
#
Swap in the sequence groups in the SWAPPED state if possible
.
self
.
swapped
=
self
.
policy
.
sort_by_priority
(
now
,
self
.
swapped
)
while
self
.
swapped
:
seq_group
=
self
.
swapped
[
0
]
# If the sequence group has been preempted in this step, stop.
if
seq_group
in
preempted
:
break
# If the sequence group cannot be swapped in, stop
.
if
not
self
.
block_manager
.
can_swap_in
(
seq_group
):
break
else
:
# All swapped sequences are swapped in.
self
.
swapped
.
clear
()
# Ensure that swap-in and swap-out never happen at the same timestep.
if
blocks_to_swap_in
:
assert
not
blocks_to_swap_out
seq_group
=
self
.
swapped
.
pop
(
0
)
self
.
_swap_in
(
seq_group
,
blocks_to_swap_in
)
self
.
_append
(
seq_group
,
blocks_to_copy
)
self
.
running
.
append
(
seq_group
)
num_batched_tokens
=
sum
(
seq_group
.
num_seqs
(
status
=
SequenceStatus
.
RUNNING
)
for
seq_group
in
self
.
running
)
# 3. Join new sequences if possible.
# NOTE: Here we implicitly assume FCFS scheduling.
# TODO(woosuk): Add a batching policy to control the batch size.
# Join waiting sequences if possible.
prompt_group_ids
:
List
[
int
]
=
[]
# NOTE(woosuk): The sequence groups in the SWAPPED state are strictly
# prioritized over the sequence groups in the WAITING state.
# This is because we want to bound the amount of CPU memory taken by
# the swapped sequence groups.
if
not
self
.
swapped
:
for
i
,
seq_group
in
enumerate
(
self
.
pending
):
self
.
waiting
=
self
.
policy
.
sort_by_priority
(
now
,
self
.
waiting
)
while
self
.
waiting
:
seq_group
=
self
.
waiting
[
0
]
# If the sequence group has been preempted in this step, stop.
if
seq_group
in
preempted
:
break
# If the sequence group cannot be allocated, stop.
if
not
self
.
block_manager
.
can_allocate
(
seq_group
):
break
# If the number of batched tokens exceeds the limit, stop.
num_prompt_tokens
=
seq_group
.
seqs
[
0
].
get_len
()
if
self
.
block_manager
.
can_allocate
(
seq_group
):
if
(
num_batched_tokens
+
num_prompt_tokens
<=
self
.
max_num_batched_tokens
):
self
.
_allocate
(
seq_group
)
num_batched_tokens
+=
num_prompt_tokens
continue
self
.
pending
=
self
.
pending
[
i
:]
break
else
:
self
.
pending
.
clear
()
if
(
num_batched_tokens
+
num_prompt_tokens
>
self
.
max_num_batched_tokens
):
break
seq_group
=
self
.
waiting
.
pop
(
0
)
self
.
_allocate
(
seq_group
)
self
.
running
.
append
(
seq_group
)
num_batched_tokens
+=
num_prompt_tokens
prompt_group_ids
.
append
(
seq_group
.
group_id
)
# 4. Create input data structures.
return
(
blocks_to_swap_in
,
blocks_to_swap_out
,
blocks_to_copy
,
prompt_group_ids
)
def
step
(
self
)
->
List
[
SequenceGroup
]:
# Schedule sequence groups.
# This function call changes the internal states of the scheduler
# such as self.running, self.swapped, and self.waiting.
scheduler_output
=
self
.
_schedule
()
blocks_to_swap_in
=
scheduler_output
[
0
]
blocks_to_swap_out
=
scheduler_output
[
1
]
blocks_to_copy
=
scheduler_output
[
2
]
prompt_group_ids
=
scheduler_output
[
3
]
# Create input data structures.
input_seq_groups
:
List
[
SequenceGroupInputs
]
=
[]
updated_seq_groups
:
List
[
SequenceGroup
]
=
self
.
running
.
copy
()
for
seq_group
in
self
.
running
:
group_id
=
seq_group
.
group_id
num_steps
=
self
.
num_steps
[
group_id
]
# NOTE(woosuk): We assume that the number of steps is 0
# for the prompt sequences.
is_prompt
=
num_steps
==
0
is_prompt
=
group_id
in
prompt_group_ids
input_tokens
:
Dict
[
int
,
List
[
int
]]
=
{}
seq_logprobs
:
Dict
[
int
,
float
]
=
{}
...
...
@@ -211,13 +211,15 @@ class Scheduler:
)
input_seq_groups
.
append
(
input_seq_group
)
# 5. Execute the first stage of the pipeline.
if
(
input_seq_groups
or
blocks_to_swap_in
or
blocks_to_swap_out
):
# Execute the first stage of the pipeline.
if
input_seq_groups
or
blocks_to_swap_in
or
blocks_to_swap_out
:
# Swap in and swap out should never happen at the same time.
assert
not
(
blocks_to_swap_in
and
blocks_to_swap_out
)
self
.
controllers
[
0
].
execute_stage
(
input_seq_groups
,
blocks_to_swap_in
,
blocks_to_swap_out
,
blocks_to_copy
,
blocks_to_swap_in
=
blocks_to_swap_in
,
blocks_to_swap_out
=
blocks_to_swap_out
,
blocks_to_copy
=
blocks_to_copy
,
)
return
updated_seq_groups
...
...
@@ -276,7 +278,106 @@ class Scheduler:
running
.
append
(
seq_group
)
self
.
running
=
running
def _allocate(self, seq_group: SequenceGroup) -> None:
    """Allocate blocks for `seq_group` and mark its sequences RUNNING.

    Also makes sure the group has a step counter: a counter that already
    exists is left as it is, otherwise one is created at 0.
    """
    self.block_manager.allocate(seq_group)
    for sequence in seq_group.seqs:
        sequence.status = SequenceStatus.RUNNING
    # FIXME(woosuk): Support interactive generation.
    # setdefault leaves an existing counter untouched.
    self.num_steps.setdefault(seq_group.group_id, 0)
def _append(
    self,
    seq_group: SequenceGroup,
    blocks_to_copy: Dict[int, List[int]],
) -> None:
    """Call the block manager's `append` for each RUNNING sequence.

    When `append` returns a (source, destination) block pair, the pair is
    recorded in `blocks_to_copy`, which is updated in place.

    Args:
        seq_group: Group whose RUNNING sequences are processed.
        blocks_to_copy: Mapping from a source block to the list of
            destination blocks recorded for it.
    """
    for sequence in seq_group.get_seqs(status=SequenceStatus.RUNNING):
        copy_pair = self.block_manager.append(sequence)
        if copy_pair is None:
            # Nothing to copy for this sequence.
            continue
        src_block, dst_block = copy_pair
        blocks_to_copy.setdefault(src_block, []).append(dst_block)
def _preempt(
    self,
    seq_group: SequenceGroup,
    blocks_to_swap_out: Dict[int, int],
    preemption_mode: Optional[PreemptionMode] = None,
) -> None:
    """Preempt `seq_group` by recomputation or by swapping.

    Args:
        seq_group: The sequence group to preempt.
        blocks_to_swap_out: Mapping passed through to the swap path, which
            updates it in place.
        preemption_mode: Mode to use. When None, the mode is chosen from
            the number of RUNNING sequences in the group (see below).

    Raises:
        AssertionError: If `preemption_mode` is neither RECOMPUTE nor
            SWAP.
    """
    # If preemption mode is not specified, we determine the mode as follows:
    # We use recomputation by default since it incurs lower overhead than
    # swapping. However, when the sequence group has multiple sequences
    # (e.g., beam search), recomputation is not supported. In such a case,
    # we use swapping instead.
    # FIXME(woosuk): This makes our scheduling policy a bit bizarre.
    # As swapped sequences are prioritized over waiting sequences,
    # sequence groups with multiple sequences are implicitly prioritized
    # over sequence groups with a single sequence.
    # TODO(woosuk): Support recomputation for sequence groups with multiple
    # sequences. This may require a more sophisticated CUDA kernel.
    if preemption_mode is None:
        seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
        if len(seqs) == 1:
            preemption_mode = PreemptionMode.RECOMPUTE
        else:
            preemption_mode = PreemptionMode.SWAP
    if preemption_mode == PreemptionMode.RECOMPUTE:
        self._preempt_by_recompute(seq_group)
    elif preemption_mode == PreemptionMode.SWAP:
        self._preempt_by_swap(seq_group, blocks_to_swap_out)
    else:
        # Raise explicitly rather than `assert False`: assert statements
        # are removed under `python -O`, which would turn an invalid mode
        # into a silent no-op.
        raise AssertionError('Invalid preemption mode.')
def _preempt_by_recompute(
    self,
    seq_group: SequenceGroup,
) -> None:
    """Preempt a single-sequence group by returning it to the waiting list.

    The group's one RUNNING sequence is marked WAITING and handed to the
    block manager's `free`; the group is then appended to `self.waiting`.
    The method asserts that the group has exactly one RUNNING sequence.
    """
    running_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
    # Only groups with exactly one running sequence are accepted here.
    assert len(running_seqs) == 1
    for sequence in running_seqs:
        sequence.status = SequenceStatus.WAITING
        self.block_manager.free(sequence)
    self.waiting.append(seq_group)
def _preempt_by_swap(
    self,
    seq_group: SequenceGroup,
    blocks_to_swap_out: Dict[int, int],
) -> None:
    """Preempt `seq_group` by swapping it out.

    The group's RUNNING sequences are marked SWAPPED, `_swap_out` is
    called for the group, and the group is appended to `self.swapped`.

    Args:
        seq_group: The sequence group to preempt.
        blocks_to_swap_out: Mapping handed to `_swap_out`, which updates
            it in place.
    """
    running_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
    for sequence in running_seqs:
        sequence.status = SequenceStatus.SWAPPED
    # NOTE(review): `_swap_out` also relabels RUNNING sequences as SWAPPED,
    # but after the loop above none remain RUNNING, so one of the two
    # relabelling steps looks redundant - confirm against
    # `block_manager.swap_out` before removing either.
    self._swap_out(seq_group, blocks_to_swap_out)
    self.swapped.append(seq_group)
def _free_seq(self, seq: Sequence) -> None:
    """Mark `seq` as FINISHED and hand it to the block manager's `free`."""
    # The status is updated before the block manager is called.
    seq.status = SequenceStatus.FINISHED
    release = self.block_manager.free
    release(seq)
def _free_seq_group(self, seq_group: SequenceGroup) -> None:
    """Drop the per-group bookkeeping kept for `seq_group`.

    Removes the group's entries from `self.num_steps` and
    `self.sampling_params`, in that order.

    Raises:
        KeyError: If either mapping has no entry for the group's id.
    """
    group_id = seq_group.group_id
    for per_group_table in (self.num_steps, self.sampling_params):
        del per_group_table[group_id]
def _swap_in(
    self,
    seq_group: SequenceGroup,
    blocks_to_swap_in: Dict[int, int],
) -> None:
    """Swap `seq_group` in and mark its SWAPPED sequences as RUNNING.

    Args:
        seq_group: The sequence group to bring back.
        blocks_to_swap_in: Mapping updated in place with the mapping
            returned by the block manager's `swap_in`.
    """
    blocks_to_swap_in.update(self.block_manager.swap_in(seq_group))
    swapped_seqs = seq_group.get_seqs(status=SequenceStatus.SWAPPED)
    for sequence in swapped_seqs:
        sequence.status = SequenceStatus.RUNNING
def _swap_out(
    self,
    seq_group: SequenceGroup,
    blocks_to_swap_out: Dict[int, int],
) -> None:
    """Swap `seq_group` out and mark its RUNNING sequences as SWAPPED.

    Asserts beforehand that the block manager reports the group can be
    swapped out.

    Args:
        seq_group: The sequence group to swap out.
        blocks_to_swap_out: Mapping updated in place with the mapping
            returned by the block manager's `swap_out`.
    """
    # The caller is expected to have made sure swapping out is possible.
    assert self.block_manager.can_swap_out(seq_group)
    blocks_to_swap_out.update(self.block_manager.swap_out(seq_group))
    for running_seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
        running_seq.status = SequenceStatus.SWAPPED
cacheflow/master/server.py
View file @
7a7929ab
...
...
@@ -10,6 +10,7 @@ from cacheflow.worker.controller import Controller, DeviceID
from
cacheflow.sequence
import
SequenceGroup
from
cacheflow.sampling_params
import
SamplingParams
class
Server
:
def
__init__
(
self
,
...
...
@@ -91,7 +92,7 @@ class Server:
return
self
.
scheduler
.
step
()
def
has_unfinished_requests
(
self
):
return
(
self
.
scheduler
.
pend
ing
or
self
.
scheduler
.
running
or
return
(
self
.
scheduler
.
wait
ing
or
self
.
scheduler
.
running
or
self
.
scheduler
.
swapped
)
...
...
cacheflow/master/simple_frontend.py
View file @
7a7929ab
import
time
from
typing
import
List
,
Optional
,
Set
,
Tuple
from
transformers
import
AutoTokenizer
...
...
@@ -39,6 +40,7 @@ class SimpleFrontend:
token_ids
:
List
[
int
],
sampling_params
:
SamplingParams
,
)
->
None
:
arrival_time
=
time
.
time
()
seqs
:
List
[
Sequence
]
=
[]
for
_
in
range
(
sampling_params
.
n
):
seq_id
=
next
(
self
.
seq_counter
)
...
...
@@ -46,7 +48,7 @@ class SimpleFrontend:
seqs
.
append
(
seq
)
group_id
=
next
(
self
.
seq_group_counter
)
seq_group
=
SequenceGroup
(
group_id
,
seqs
)
seq_group
=
SequenceGroup
(
group_id
,
seqs
,
arrival_time
)
self
.
inputs
.
append
((
seq_group
,
sampling_params
))
def
get_inputs
(
self
)
->
List
[
Tuple
[
SequenceGroup
,
SamplingParams
]]:
...
...
cacheflow/sequence.py
View file @
7a7929ab
...
...
@@ -7,7 +7,7 @@ from cacheflow.sampling_params import SamplingParams
class
SequenceStatus
(
enum
.
Enum
):
PEND
ING
=
enum
.
auto
()
WAIT
ING
=
enum
.
auto
()
RUNNING
=
enum
.
auto
()
SWAPPED
=
enum
.
auto
()
FINISHED
=
enum
.
auto
()
...
...
@@ -28,7 +28,7 @@ class Sequence:
# Initialize the logical token blocks with the given token ids.
self
.
add
(
token_ids
)
self
.
status
=
SequenceStatus
.
PEND
ING
self
.
status
=
SequenceStatus
.
WAIT
ING
self
.
output_logprobs
:
List
[
Dict
[
int
,
float
]]
=
[]
self
.
cumulative_logprobs
=
0.0
...
...
@@ -88,9 +88,11 @@ class SequenceGroup:
self
,
group_id
:
int
,
seqs
:
List
[
Sequence
],
arrival_time
:
float
,
)
->
None
:
self
.
group_id
=
group_id
self
.
seqs
=
seqs
self
.
arrival_time
=
arrival_time
def
get_seqs
(
self
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment