Unverified commit 23fdab00, authored by Siyuan Liu and committed by GitHub
Browse files

[Hardware][TPU] Skip failed compilation test (#15421)


Signed-off-by: Siyuan Liu <lsiyuan@google.com>
parent 623e2ed2
...@@ -22,7 +22,7 @@ docker run --privileged --net host --shm-size=16G -it \ ...@@ -22,7 +22,7 @@ docker run --privileged --net host --shm-size=16G -it \
&& export VLLM_USE_V1=1 \ && export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \ && export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo TEST_1 \ && echo TEST_1 \
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \ && pytest /workspace/vllm/tests/tpu/test_compilation.py \
&& echo TEST_2 \ && echo TEST_2 \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \ && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
&& echo TEST_3 \ && echo TEST_3 \
......
...@@ -5,92 +5,96 @@ import os ...@@ -5,92 +5,96 @@ import os
import tempfile import tempfile
import depyf import depyf
import pytest
from vllm.config import CompilationLevel from vllm.config import CompilationLevel
temp_dir = tempfile.mkdtemp()
with depyf.prepare_debug(temp_dir): @pytest.mark.skip(reason="Not working; needs investigation.")
from vllm import LLM, SamplingParams def test_tpu_compilation():
temp_dir = tempfile.mkdtemp()
prompts = [ with depyf.prepare_debug(temp_dir):
"A robot may not injure a human being", from vllm import LLM, SamplingParams
"It is only with the heart that one can see rightly;",
"The greatest glory in living lies not in never falling,", prompts = [
] "A robot may not injure a human being",
answers = [ "It is only with the heart that one can see rightly;",
" or, through inaction, allow a human being to come to harm.", "The greatest glory in living lies not in never falling,",
" what is essential is invisible to the eye.", ]
" but in rising every time we fall.", answers = [
] " or, through inaction, allow a human being to come to harm.",
N = 1 " what is essential is invisible to the eye.",
# Currently, top-p sampling is disabled. `top_p` should be 1.0. " but in rising every time we fall.",
sampling_params = SamplingParams(temperature=0.7, ]
top_p=1.0, N = 1
n=N, # Currently, top-p sampling is disabled. `top_p` should be 1.0.
max_tokens=16) sampling_params = SamplingParams(temperature=0.7,
top_p=1.0,
# Set `enforce_eager=True` to avoid ahead-of-time compilation. n=N,
# In real workloads, `enforce_eager` should be `False`. max_tokens=16)
# disable custom dispatcher, let Dynamo take over # Set `enforce_eager=True` to avoid ahead-of-time compilation.
# all the control # In real workloads, `enforce_eager` should be `False`.
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
max_model_len=512, # disable custom dispatcher, let Dynamo take over
max_num_seqs=64, # all the control
enforce_eager=True, llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
compilation_config={"level": CompilationLevel.DYNAMO_AS_IS}) max_model_len=512,
outputs = llm.generate(prompts, sampling_params) max_num_seqs=64,
for output, answer in zip(outputs, answers): enforce_eager=True,
prompt = output.prompt compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
generated_text = output.outputs[0].text outputs = llm.generate(prompts, sampling_params)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") for output, answer in zip(outputs, answers):
assert generated_text.startswith(answer) prompt = output.prompt
generated_text = output.outputs[0].text
compiled_codes = sorted( print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
glob.glob(os.path.join(temp_dir, "__transformed_code*.py"))) assert generated_text.startswith(answer)
for i, compiled_code in enumerate(compiled_codes): compiled_codes = sorted(
print("{} file: {}".format(i + 1, compiled_code)) glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
# We should only trigger Dynamo compilation 4 times: for i, compiled_code in enumerate(compiled_codes):
# 1. forward pass (symbolic) print("{} file: {}".format(i + 1, compiled_code))
# 2. compute_logits (symbolic)
# 3. forward pass (shape 16) # We should only trigger Dynamo compilation 4 times:
# 4. forward pass (shape 32) # 1. forward pass (symbolic)
# and later calls should not trigger Dynamo compilation again. # 2. compute_logits (symbolic)
# NOTE: It might still trigger XLA compilation. # 3. forward pass (shape 16)
# 4. forward pass (shape 32)
# Check we have 4 compiled codes # and later calls should not trigger Dynamo compilation again.
assert len(compiled_codes) == 4 # NOTE: It might still trigger XLA compilation.
kv_cache_prefix = "kv_cache" # Check we have 4 compiled codes
attn_prefix = "ragged_paged_attention" assert len(compiled_codes) == 4
# Check all the compilations are as expected kv_cache_prefix = "kv_cache"
compiled_fns = sorted( attn_prefix = "ragged_paged_attention"
glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
# Check all the compilations are as expected
for i, compiled_fn in enumerate(compiled_fns): compiled_fns = sorted(
print("{} file: {}".format(i + 1, compiled_fn)) glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
# The first compilation is symbolic, so it should not have any kv_caches for i, compiled_fn in enumerate(compiled_fns):
with open(compiled_fns[0]) as f: print("{} file: {}".format(i + 1, compiled_fn))
content = f.read()
assert kv_cache_prefix not in content # The first compilation is symbolic, so it should not have any kv_caches
with open(compiled_fns[0]) as f:
# The second compilation is symbolic, so it should not have any kv_caches content = f.read()
with open(compiled_fns[1]) as f: assert kv_cache_prefix not in content
content = f.read()
assert kv_cache_prefix not in content # The second compilation is symbolic, so it should not have any kv_caches
with open(compiled_fns[1]) as f:
# The third compilation is shape 16, so it should have kv_caches and the content = f.read()
# ragged_paged_attention assert kv_cache_prefix not in content
with open(compiled_fns[2]) as f:
content = f.read() # The third compilation is shape 16, so it should have kv_caches and the
assert (kv_cache_prefix in content and attn_prefix in content) # ragged_paged_attention
with open(compiled_fns[2]) as f:
# The fourth compilation is shape 32, so it should have kv_caches and the content = f.read()
# ragged_paged_attention assert (kv_cache_prefix in content and attn_prefix in content)
with open(compiled_fns[3]) as f:
content = f.read() # The fourth compilation is shape 32, so it should have kv_caches and the
assert (kv_cache_prefix in content and attn_prefix in content) # ragged_paged_attention
with open(compiled_fns[3]) as f:
content = f.read()
assert (kv_cache_prefix in content and attn_prefix in content)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment