Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e4d652ea
"cacheflow/vscode:/vscode.git/clone" did not exist on "057daef778ec4e951841f44afda1cd0b1eb50ee4"
Unverified
Commit
e4d652ea
authored
Oct 10, 2024
by
youkaichao
Committed by
GitHub
Oct 10, 2024
Browse files
[torch.compile] integration with compilation control (#9058)
parent
78c0b416
Changes
22
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
17 additions
and
8 deletions
+17
-8
vllm/sequence.py
vllm/sequence.py
+3
-4
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+14
-4
No files found.
vllm/sequence.py
View file @
e4d652ea
...
@@ -1137,10 +1137,9 @@ class EmbeddingSequenceGroupOutput(
...
@@ -1137,10 +1137,9 @@ class EmbeddingSequenceGroupOutput(
return
self
.
embeddings
==
other
.
embeddings
return
self
.
embeddings
==
other
.
embeddings
class
IntermediateTensors
(
# cannot use msgspec.Struct here because Dynamo does not support it
msgspec
.
Struct
,
@
dataclass
omit_defaults
=
True
,
# type: ignore[call-arg]
class
IntermediateTensors
:
array_like
=
True
):
# type: ignore[call-arg]
"""For all pipeline stages except the last, we need to return the hidden
"""For all pipeline stages except the last, we need to return the hidden
states and residuals to be sent to the next stage. This data structure
states and residuals to be sent to the next stage. This data structure
contains the hidden states and residuals for a request.
contains the hidden states and residuals for a request.
...
...
vllm/worker/model_runner.py
View file @
e4d652ea
...
@@ -18,6 +18,8 @@ import vllm.envs as envs
...
@@ -18,6 +18,8 @@ import vllm.envs as envs
from
vllm.attention
import
AttentionMetadata
,
get_attn_backend
from
vllm.attention
import
AttentionMetadata
,
get_attn_backend
from
vllm.attention.backends.abstract
import
AttentionState
from
vllm.attention.backends.abstract
import
AttentionState
from
vllm.attention.backends.utils
import
CommonAttentionState
from
vllm.attention.backends.utils
import
CommonAttentionState
from
vllm.compilation.compile_context
import
set_compile_context
from
vllm.compilation.levels
import
CompilationLevel
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
ObservabilityConfig
,
ParallelConfig
,
ModelConfig
,
ObservabilityConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
)
PromptAdapterConfig
,
SchedulerConfig
)
...
@@ -1126,10 +1128,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
...
@@ -1126,10 +1128,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
"provided. Defaulting to scaling factors of 1.0. "
"provided. Defaulting to scaling factors of 1.0. "
"This may lead to less accurate results!"
)
"This may lead to less accurate results!"
)
if
envs
.
VLLM_T
EST_DYNAMO_GRAPH_CAPTURE
and
supports_dynamo
():
if
envs
.
VLLM_T
ORCH_COMPILE_LEVEL
==
CompilationLevel
.
DYNAMO_AS_IS
\
from
vllm.compilation.backends
import
vllm_backend
and
supports_dynamo
():
from
vllm.plugins
import
get_torch_compile_backend
from
vllm.plugins
import
get_torch_compile_backend
backend
=
get_torch_compile_backend
()
or
vllm_backend
backend
=
get_torch_compile_backend
()
or
"eager"
self
.
model
=
torch
.
compile
(
self
.
model
=
torch
.
compile
(
self
.
model
,
self
.
model
,
fullgraph
=
envs
.
VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE
,
fullgraph
=
envs
.
VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE
,
...
@@ -1289,7 +1291,15 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
...
@@ -1289,7 +1291,15 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
batch_size
=
batch_size
,
batch_size
=
batch_size
,
dtype
=
self
.
model_config
.
dtype
,
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
)
device
=
self
.
device
)
self
.
execute_model
(
model_input
,
kv_caches
,
intermediate_tensors
)
graph_batch_size
=
self
.
max_batchsize_to_capture
batch_size_capture_list
=
[
bs
for
bs
in
_BATCH_SIZES_TO_CAPTURE
if
bs
<=
graph_batch_size
]
if
self
.
model_config
.
enforce_eager
:
batch_size_capture_list
=
[]
with
set_compile_context
(
batch_size_capture_list
):
self
.
execute_model
(
model_input
,
kv_caches
,
intermediate_tensors
)
torch
.
cuda
.
synchronize
()
torch
.
cuda
.
synchronize
()
return
return
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment