Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a7f65c2b
"vscode:/vscode.git/clone" did not exist on "4fdd6f5cbf877de7c4de33086fe41bb0ac1d3cf3"
Unverified
Commit
a7f65c2b
authored
Aug 28, 2024
by
youkaichao
Committed by
GitHub
Aug 28, 2024
Browse files
[torch.compile] remove reset (#7975)
parent
4289cad3
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
28 additions
and
15 deletions
+28
-15
tests/tpu/test_compilation.py
tests/tpu/test_compilation.py
+28
-7
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+0
-4
vllm/worker/tpu_worker.py
vllm/worker/tpu_worker.py
+0
-4
No files found.
tests/tpu/test_compilation.py
View file @
a7f65c2b
...
...
@@ -5,6 +5,10 @@ import tempfile
import
depyf
# disable the custom dispatcher and let Dynamo take over
# all the control
os
.
environ
[
'VLLM_DYNAMO_USE_CUSTOM_DISPATCHER'
]
=
"0"
temp_dir
=
tempfile
.
mkdtemp
()
with
depyf
.
prepare_debug
(
temp_dir
):
cur_dir
=
os
.
path
.
dirname
(
__file__
)
...
...
@@ -16,19 +20,36 @@ with depyf.prepare_debug(temp_dir):
compiled_code
=
sorted
(
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__transformed_code*.py"
)))
full_code
=
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"full_code*.py"
))[
0
]
# we should only trigger Dynamo compilation three times:
# one for the profiling phase
(and the compiled artifact will be discarded)
# one for the profiling phase
without kv cache
# one for the prefill phase with symbolic shapes
# one for the decode phase with symbolic shapes
# and later calls should not trigger Dynamo compilation again.
# NOTE: it might still trigger XLA compilation.
# check we have three compiled code files
# this is the assumption when we use the custom dispatcher
assert
len
(
compiled_code
)
==
3
# check the first compilation is discarded
with
open
(
full_code
)
as
f
:
full_code_content
=
f
.
read
()
profile_function
=
compiled_code
[
0
].
split
(
"."
)[
0
]
assert
profile_function
not
in
full_code_content
# check all the compilations are as expected
compiled_fn
=
sorted
(
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__compiled_fn*Captured*.py"
)))
# the first compilation is the profiling phase,
# it should not have any kv cache
with
open
(
compiled_fn
[
0
])
as
f
:
content
=
f
.
read
()
assert
"kv_caches"
not
in
content
# the second compilation is the prefill phase,
# it should have kv cache and the flash_attention op
with
open
(
compiled_fn
[
1
])
as
f
:
content
=
f
.
read
()
assert
"kv_caches"
in
content
and
"torch.ops.xla.flash_attention"
in
content
# the third compilation is the decode phase,
# it should have kv cache and the paged_attention op
with
open
(
compiled_fn
[
2
])
as
f
:
content
=
f
.
read
()
assert
"kv_caches"
in
content
and
"torch.ops.xla.paged_attention"
in
content
vllm/worker/model_runner.py
View file @
a7f65c2b
...
...
@@ -1123,10 +1123,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
device
=
self
.
device
)
self
.
execute_model
(
model_input
,
kv_caches
,
intermediate_tensors
)
torch
.
cuda
.
synchronize
()
# reset and discard the guard and compiled bytecode for profiling runs
torch
.
_dynamo
.
reset
()
return
def
remove_all_loras
(
self
):
...
...
vllm/worker/tpu_worker.py
View file @
a7f65c2b
...
...
@@ -143,10 +143,6 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
num_cpu_blocks
=
int
(
self
.
cache_config
.
swap_space_bytes
//
block_size_bytes
)
num_cpu_blocks
=
(
num_cpu_blocks
//
8
)
*
8
# Round down to a multiple of 8.
# reset and discard the guard and compiled bytecode for profiling runs
torch
.
_dynamo
.
reset
()
return
num_tpu_blocks
,
num_cpu_blocks
def
initialize_cache
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment