Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e3eb146f
Unverified
Commit
e3eb146f
authored
Feb 28, 2026
by
Woosuk Kwon
Committed by
GitHub
Feb 28, 2026
Browse files
[Model Runner V2] Add ModelStateInterface [4/N] (#35621)
Signed-off-by:
Woosuk Kwon
<
woosuk@inferact.ai
>
parent
95a395db
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
90 additions
and
4 deletions
+90
-4
vllm/v1/worker/gpu/cudagraph_utils.py
vllm/v1/worker/gpu/cudagraph_utils.py
+1
-1
vllm/v1/worker/gpu/model_runner.py
vllm/v1/worker/gpu/model_runner.py
+2
-2
vllm/v1/worker/gpu/model_states/__init__.py
vllm/v1/worker/gpu/model_states/__init__.py
+18
-0
vllm/v1/worker/gpu/model_states/default.py
vllm/v1/worker/gpu/model_states/default.py
+2
-1
vllm/v1/worker/gpu/model_states/interface.py
vllm/v1/worker/gpu/model_states/interface.py
+67
-0
No files found.
vllm/v1/worker/gpu/cudagraph_utils.py
View file @
e3eb146f
...
...
@@ -22,7 +22,7 @@ from vllm.v1.worker.gpu.attn_utils import (
from
vllm.v1.worker.gpu.block_table
import
BlockTables
from
vllm.v1.worker.gpu.dp_utils
import
make_num_tokens_across_dp
from
vllm.v1.worker.gpu.input_batch
import
InputBuffers
from
vllm.v1.worker.gpu.model_states
import
ModelState
from
vllm.v1.worker.gpu.model_states
.interface
import
ModelState
from
vllm.v1.worker.utils
import
AttentionGroup
...
...
vllm/v1/worker/gpu/model_runner.py
View file @
e3eb146f
...
...
@@ -78,7 +78,7 @@ from vllm.v1.worker.gpu.kv_connector import (
)
from
vllm.v1.worker.gpu.lora_utils
import
LoraState
from
vllm.v1.worker.gpu.mm.encoder_cache
import
EncoderCache
from
vllm.v1.worker.gpu.model_states
import
M
odel
S
tate
from
vllm.v1.worker.gpu.model_states
import
init_m
odel
_s
tate
from
vllm.v1.worker.gpu.pool.pooling_runner
import
PoolingRunner
from
vllm.v1.worker.gpu.pp_utils
import
pp_broadcast
,
pp_receive
from
vllm.v1.worker.gpu.sample.output
import
SamplerOutput
...
...
@@ -267,7 +267,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
prepare_communication_buffer_for_model
(
self
.
speculator
)
# Initialize the components that require the model.
self
.
model_state
=
M
odel
S
tate
(
self
.
model_state
=
init_m
odel
_s
tate
(
self
.
vllm_config
,
self
.
model
,
self
.
encoder_cache
,
self
.
device
)
if
self
.
is_pooling_model
:
...
...
vllm/v1/worker/gpu/model_states/__init__.py
0 → 100644
View file @
e3eb146f
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
torch
import
torch.nn
as
nn
from
vllm.config
import
VllmConfig
from
vllm.v1.worker.gpu.mm.encoder_cache
import
EncoderCache
def init_model_state(
    vllm_config: VllmConfig,
    model: nn.Module,
    encoder_cache: EncoderCache | None,
    device: torch.device,
):
    """Create the model state object for a loaded model.

    At present this always builds a ``DefaultModelState``; the four
    arguments are forwarded to its constructor positionally and unchanged.

    Args:
        vllm_config: The vLLM configuration object.
        model: The model the state is built for.
        encoder_cache: An ``EncoderCache`` instance, or ``None``.
        device: The torch device handed to the state object.

    Returns:
        The newly constructed ``DefaultModelState`` instance.
    """
    # Deferred import: resolved on first call rather than at package import
    # time (presumably to avoid an import cycle with the `default`
    # submodule -- TODO confirm).
    from vllm.v1.worker.gpu.model_states.default import DefaultModelState

    state = DefaultModelState(vllm_config, model, encoder_cache, device)
    return state
vllm/v1/worker/gpu/model_states.py
→
vllm/v1/worker/gpu/model_states
/default
.py
View file @
e3eb146f
...
...
@@ -13,11 +13,12 @@ from vllm.v1.worker.gpu.input_batch import InputBatch
from
vllm.v1.worker.gpu.mm.encoder_cache
import
EncoderCache
from
vllm.v1.worker.gpu.mm.encoder_runner
import
EncoderRunner
from
vllm.v1.worker.gpu.mm.mrope_utils
import
MRopeState
from
vllm.v1.worker.gpu.model_states.interface
import
ModelState
from
vllm.v1.worker.gpu.states
import
RequestState
from
vllm.v1.worker.utils
import
AttentionGroup
class
ModelState
:
class
Default
ModelState
(
ModelState
)
:
def
__init__
(
self
,
vllm_config
:
VllmConfig
,
...
...
vllm/v1/worker/gpu/model_states/interface.py
0 → 100644
View file @
e3eb146f
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
abc
import
ABC
,
abstractmethod
from
typing
import
Any
import
torch
import
torch.nn
as
nn
from
vllm.config
import
VllmConfig
from
vllm.v1.core.sched.output
import
NewRequestData
from
vllm.v1.kv_cache_interface
import
KVCacheConfig
from
vllm.v1.worker.gpu.input_batch
import
InputBatch
from
vllm.v1.worker.gpu.mm.encoder_cache
import
EncoderCache
from
vllm.v1.worker.gpu.states
import
RequestState
from
vllm.v1.worker.utils
import
AttentionGroup
class ModelState(ABC):
    """Abstract interface for per-model state objects.

    Every method below is abstract and raises ``NotImplementedError`` in
    this base class, so a concrete subclass must override all of them
    (including ``__init__``) before it can be instantiated.
    """

    @abstractmethod
    def __init__(
        self,
        vllm_config: VllmConfig,
        model: nn.Module,
        encoder_cache: EncoderCache | None,
        device: torch.device,
    ) -> None:
        """Construct the state object.

        Args:
            vllm_config: The vLLM configuration object.
            model: The model this state belongs to.
            encoder_cache: An ``EncoderCache`` instance, or ``None``.
            device: The torch device to use.
        """
        raise NotImplementedError

    @abstractmethod
    def add_request(
        self,
        req_index: int,
        new_req_data: NewRequestData,
    ) -> None:
        """Handle a newly added request.

        Args:
            req_index: Integer index associated with the request
                (presumably its slot in the batch/request state --
                TODO confirm against implementations).
            new_req_data: The ``NewRequestData`` describing the request.
        """
        raise NotImplementedError

    @abstractmethod
    def apply_staged_writes(self) -> None:
        """Apply staged writes.

        NOTE(review): what is staged, and where it is written, is defined
        by the concrete subclass; nothing in this interface specifies it.
        """
        raise NotImplementedError

    @abstractmethod
    def get_mm_embeddings(
        self,
        scheduled_encoder_inputs: dict[str, list[int]],
        input_batch: InputBatch,
        req_states: RequestState,
    ) -> torch.Tensor:
        """Return multimodal embeddings as a tensor.

        Args:
            scheduled_encoder_inputs: Mapping from a string key to a list
                of ints (presumably request id -> scheduled encoder input
                indices -- TODO confirm).
            input_batch: The current ``InputBatch``.
            req_states: The ``RequestState`` object.
        """
        raise NotImplementedError

    @abstractmethod
    def prepare_inputs(
        self,
        input_batch: InputBatch,
        req_states: RequestState,
    ) -> dict[str, torch.Tensor | None]:
        """Prepare inputs for the given batch.

        Args:
            input_batch: The current ``InputBatch``.
            req_states: The ``RequestState`` object.

        Returns:
            A dict mapping string names to tensors or ``None``.
        """
        raise NotImplementedError

    @abstractmethod
    def prepare_dummy_inputs(
        self,
        num_reqs: int,
        num_tokens: int,
    ) -> dict[str, torch.Tensor | None]:
        """Prepare dummy inputs sized by request and token counts.

        Args:
            num_reqs: Number of requests.
            num_tokens: Number of tokens.

        Returns:
            A dict mapping string names to tensors or ``None``; the return
            annotation is the same as that of ``prepare_inputs``.
        """
        raise NotImplementedError

    @abstractmethod
    def prepare_attn(
        self,
        input_batch: InputBatch,
        block_tables: tuple[torch.Tensor, ...],
        slot_mappings: torch.Tensor,
        attn_groups: list[list[AttentionGroup]],
        kv_cache_config: KVCacheConfig,
    ) -> dict[str, Any]:
        """Prepare attention-related data for the given batch.

        Args:
            input_batch: The current ``InputBatch``.
            block_tables: A tuple of tensors (presumably one per KV cache
                group -- TODO confirm).
            slot_mappings: A single tensor of slot mappings.
            attn_groups: Nested lists of ``AttentionGroup`` objects.
            kv_cache_config: The ``KVCacheConfig`` in use.

        Returns:
            A dict keyed by string; the value types are unconstrained
            (``Any``) at the interface level.
        """
        raise NotImplementedError
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment