Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
64cc6444
Unverified
Commit
64cc6444
authored
Aug 26, 2024
by
youkaichao
Committed by
GitHub
Aug 26, 2024
Browse files
[core][torch.compile] discard the compile for profiling (#7796)
parent
39178c7f
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
43 additions
and
2 deletions
+43
-2
.buildkite/run-tpu-test.sh
.buildkite/run-tpu-test.sh
+1
-2
tests/tpu/test_compilation.py
tests/tpu/test_compilation.py
+34
-0
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+4
-0
vllm/worker/tpu_worker.py
vllm/worker/tpu_worker.py
+4
-0
No files found.
.buildkite/run-tpu-test.sh
View file @
64cc6444
...
...
@@ -12,5 +12,4 @@ remove_docker_container
# For HF_TOKEN.
source
/etc/environment
# Run a simple end-to-end example.
docker run
--privileged
--net
host
--shm-size
=
16G
-it
-e
HF_TOKEN
=
$HF_TOKEN
--name
tpu-test vllm-tpu
\
python3 /workspace/vllm/examples/offline_inference_tpu.py
docker run
--privileged
--net
host
--shm-size
=
16G
-it
-e
HF_TOKEN
=
$HF_TOKEN
--name
tpu-test vllm-tpu /bin/bash
-c
"python3 -m pip install git+https://github.com/thuml/depyf.git && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
tests/tpu/test_compilation.py
0 → 100644
View file @
64cc6444
import
glob
import
os
import
runpy
import
tempfile
import
depyf
# Scratch directory that is handed to depyf; the dumps globbed below
# (`__transformed_code*.py`, `full_code*.py`) are looked up in it afterwards.
# It is intentionally left on disk so a failing run can be inspected.
temp_dir = tempfile.mkdtemp()

with depyf.prepare_debug(temp_dir):
    # Resolve the repository root from this file's *absolute* location
    # (<root>/tests/tpu/test_compilation.py) so the example is found no matter
    # which working directory or relative path the script is launched with.
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    parent_dir = os.path.dirname(cur_dir)
    root_dir = os.path.dirname(parent_dir)
    example_file = os.path.join(root_dir, "examples",
                                "offline_inference_tpu.py")
    runpy.run_path(example_file)

# The dumps are collected only after the `with` block has exited.
compiled_code = sorted(
    glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
full_code_files = glob.glob(os.path.join(temp_dir, "full_code*.py"))
# Fail with a readable message instead of a bare IndexError.
assert full_code_files, f"no full_code*.py dump found in {temp_dir}"
full_code = full_code_files[0]

# We should only trigger Dynamo compilation three times:
# one for the profiling phase (and the compiled artifact will be discarded),
# one for the prefill phase with symbolic shapes,
# one for the decode phase with symbolic shapes,
# and later calls should not trigger Dynamo compilation again.
# NOTE: it might still trigger XLA compilation.

# Check that we have exactly three pieces of compiled code.
assert len(compiled_code) == 3, (
    f"expected 3 Dynamo compilations, found {len(compiled_code)}: "
    f"{compiled_code}")

# Check that the first (profiling) compilation is discarded.
with open(full_code) as f:
    full_code_content = f.read()

# Compare by the dumped file's stem, not by its full filesystem path: the path
# (temp dir included) would practically never occur inside the dumped source,
# which would make the assertion below pass vacuously.
# NOTE(review): assumes depyf names each `__transformed_code*.py` file after
# the function it contains -- confirm against the depyf documentation.
profile_function = os.path.splitext(os.path.basename(compiled_code[0]))[0]
assert profile_function not in full_code_content, (
    f"{profile_function} from the profiling run is still present in "
    f"{full_code}")
vllm/worker/model_runner.py
View file @
64cc6444
...
...
@@ -1097,6 +1097,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
device
=
self
.
device
)
self
.
execute_model
(
model_input
,
kv_caches
,
intermediate_tensors
)
torch
.
cuda
.
synchronize
()
# reset and discard the guard and compiled bytecode for profiling runs
torch
.
_dynamo
.
reset
()
return
def
remove_all_loras
(
self
):
...
...
vllm/worker/tpu_worker.py
View file @
64cc6444
...
...
@@ -143,6 +143,10 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
num_cpu_blocks
=
int
(
self
.
cache_config
.
swap_space_bytes
//
block_size_bytes
)
num_cpu_blocks
=
(
num_cpu_blocks
//
8
)
*
8
# Round down to 8.
# reset and discard the guard and compiled bytecode for profiling runs
torch
.
_dynamo
.
reset
()
return
num_tpu_blocks
,
num_cpu_blocks
def
initialize_cache
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment