Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
23fdab00
Unverified
Commit
23fdab00
authored
Mar 24, 2025
by
Siyuan Liu
Committed by
GitHub
Mar 24, 2025
Browse files
[Hardware][TPU] Skip failed compilation test (#15421)
Signed-off-by:
Siyuan Liu
<
lsiyuan@google.com
>
parent
623e2ed2
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
91 additions
and
87 deletions
+91
-87
.buildkite/run-tpu-v1-test.sh
.buildkite/run-tpu-v1-test.sh
+1
-1
tests/tpu/test_compilation.py
tests/tpu/test_compilation.py
+90
-86
No files found.
.buildkite/run-tpu-v1-test.sh
View file @
23fdab00
...
@@ -22,7 +22,7 @@ docker run --privileged --net host --shm-size=16G -it \
...
@@ -22,7 +22,7 @@ docker run --privileged --net host --shm-size=16G -it \
&& export VLLM_USE_V1=1
\
&& export VLLM_USE_V1=1
\
&& export VLLM_XLA_CHECK_RECOMPILATION=1
\
&& export VLLM_XLA_CHECK_RECOMPILATION=1
\
&& echo TEST_1
\
&& echo TEST_1
\
&& pyt
hon3
/workspace/vllm/tests/tpu/test_compilation.py
\
&& pyt
est
/workspace/vllm/tests/tpu/test_compilation.py
\
&& echo TEST_2
\
&& echo TEST_2
\
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py
\
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py
\
&& echo TEST_3
\
&& echo TEST_3
\
...
...
tests/tpu/test_compilation.py
View file @
23fdab00
...
@@ -5,92 +5,96 @@ import os
...
@@ -5,92 +5,96 @@ import os
import
tempfile
import
tempfile
import
depyf
import
depyf
import
pytest
from
vllm.config
import
CompilationLevel
from
vllm.config
import
CompilationLevel
temp_dir
=
tempfile
.
mkdtemp
()
with
depyf
.
prepare_debug
(
temp_dir
):
@
pytest
.
mark
.
skip
(
reason
=
"Not working; needs investigation."
)
from
vllm
import
LLM
,
SamplingParams
def
test_tpu_compilation
():
temp_dir
=
tempfile
.
mkdtemp
()
prompts
=
[
with
depyf
.
prepare_debug
(
temp_dir
):
"A robot may not injure a human being"
,
from
vllm
import
LLM
,
SamplingParams
"It is only with the heart that one can see rightly;"
,
"The greatest glory in living lies not in never falling,"
,
prompts
=
[
]
"A robot may not injure a human being"
,
answers
=
[
"It is only with the heart that one can see rightly;"
,
" or, through inaction, allow a human being to come to harm."
,
"The greatest glory in living lies not in never falling,"
,
" what is essential is invisible to the eye."
,
]
" but in rising every time we fall."
,
answers
=
[
]
" or, through inaction, allow a human being to come to harm."
,
N
=
1
" what is essential is invisible to the eye."
,
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
" but in rising every time we fall."
,
sampling_params
=
SamplingParams
(
temperature
=
0.7
,
]
top_p
=
1.0
,
N
=
1
n
=
N
,
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
max_tokens
=
16
)
sampling_params
=
SamplingParams
(
temperature
=
0.7
,
top_p
=
1.0
,
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
n
=
N
,
# In real workloads, `enforce_eager` should be `False`.
max_tokens
=
16
)
# disable custom dispatcher, let Dynamo take over
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# all the control
# In real workloads, `enforce_eager` should be `False`.
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-1.5B-Instruct"
,
max_model_len
=
512
,
# disable custom dispatcher, let Dynamo take over
max_num_seqs
=
64
,
# all the control
enforce_eager
=
True
,
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-1.5B-Instruct"
,
compilation_config
=
{
"level"
:
CompilationLevel
.
DYNAMO_AS_IS
})
max_model_len
=
512
,
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
max_num_seqs
=
64
,
for
output
,
answer
in
zip
(
outputs
,
answers
):
enforce_eager
=
True
,
prompt
=
output
.
prompt
compilation_config
=
{
"level"
:
CompilationLevel
.
DYNAMO_AS_IS
})
generated_text
=
output
.
outputs
[
0
].
text
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
for
output
,
answer
in
zip
(
outputs
,
answers
):
assert
generated_text
.
startswith
(
answer
)
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
compiled_codes
=
sorted
(
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__transformed_code*.py"
)))
assert
generated_text
.
startswith
(
answer
)
for
i
,
compiled_code
in
enumerate
(
compiled_codes
):
compiled_codes
=
sorted
(
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_code
))
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__transformed_code*.py"
)))
# We should only trigger Dynamo compilation 4 times:
for
i
,
compiled_code
in
enumerate
(
compiled_codes
):
# 1. forward pass (symbolic)
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_code
))
# 2. compute_logits (symbolic)
# 3. forward pass (shape 16)
# We should only trigger Dynamo compilation 4 times:
# 4. forward pass (shape 32)
# 1. forward pass (symbolic)
# and later calls should not trigger Dynamo compilation again.
# 2. compute_logits (symbolic)
# NOTE: It might still trigger XLA compilation.
# 3. forward pass (shape 16)
# 4. forward pass (shape 32)
# Check we have 4 compiled codes
# and later calls should not trigger Dynamo compilation again.
assert
len
(
compiled_codes
)
==
4
# NOTE: It might still trigger XLA compilation.
kv_cache_prefix
=
"kv_cache"
# Check we have 4 compiled codes
attn_prefix
=
"ragged_paged_attention"
assert
len
(
compiled_codes
)
==
4
# Check all the compilations are as expected
kv_cache_prefix
=
"kv_cache"
compiled_fns
=
sorted
(
attn_prefix
=
"ragged_paged_attention"
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__compiled_fn*Captured*.py"
)))
# Check all the compilations are as expected
for
i
,
compiled_fn
in
enumerate
(
compiled_fns
):
compiled_fns
=
sorted
(
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_fn
))
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__compiled_fn*Captured*.py"
)))
# The first compilation is symbolic, so it should not have any kv_caches
for
i
,
compiled_fn
in
enumerate
(
compiled_fns
):
with
open
(
compiled_fns
[
0
])
as
f
:
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_fn
))
content
=
f
.
read
()
assert
kv_cache_prefix
not
in
content
# The first compilation is symbolic, so it should not have any kv_caches
with
open
(
compiled_fns
[
0
])
as
f
:
# The second compilation is symbolic, so it should not have any kv_caches
content
=
f
.
read
()
with
open
(
compiled_fns
[
1
])
as
f
:
assert
kv_cache_prefix
not
in
content
content
=
f
.
read
()
assert
kv_cache_prefix
not
in
content
# The second compilation is symbolic, so it should not have any kv_caches
with
open
(
compiled_fns
[
1
])
as
f
:
# The third compilation is shape 16, so it should have kv_caches and the
content
=
f
.
read
()
# ragged_paged_attention
assert
kv_cache_prefix
not
in
content
with
open
(
compiled_fns
[
2
])
as
f
:
content
=
f
.
read
()
# The third compilation is shape 16, so it should have kv_caches and the
assert
(
kv_cache_prefix
in
content
and
attn_prefix
in
content
)
# ragged_paged_attention
with
open
(
compiled_fns
[
2
])
as
f
:
# The fourth compilation is shape 32, so it should have kv_caches and the
content
=
f
.
read
()
# ragged_paged_attention
assert
(
kv_cache_prefix
in
content
and
attn_prefix
in
content
)
with
open
(
compiled_fns
[
3
])
as
f
:
content
=
f
.
read
()
# The fourth compilation is shape 32, so it should have kv_caches and the
assert
(
kv_cache_prefix
in
content
and
attn_prefix
in
content
)
# ragged_paged_attention
with
open
(
compiled_fns
[
3
])
as
f
:
content
=
f
.
read
()
assert
(
kv_cache_prefix
in
content
and
attn_prefix
in
content
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment