Disable enforce_eager for V1 TPU sampler and structured output tests (#17016)

Signed-off-by: mgoin <mgoin64@gmail.com>

Disable enforce_eager for V1 TPU sampler and structured output tests (#17016)
Signed-off-by: mgoin <mgoin64@gmail.com>
14288d13 · Michael Goin · GitHub · b411418f · 14288d13 · 14288d13
Unverified commit 14288d13, authored Apr 24, 2025 by Michael Goin; committed by GitHub on Apr 24, 2025.
3 changed files
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -19,6 +19,7 @@ docker run --privileged --net host --shm-size=16G -it \
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
    && python3 -m pip install pytest pytest-asyncio tpu-info \
    && python3 -m pip install lm_eval[api]==0.4.4 \
+    && export VLLM_XLA_CACHE_PATH= \
    && export VLLM_USE_V1=1 \
    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
    && echo HARDWARE \

--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -13,6 +13,7 @@ from pydantic import BaseModel
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
+from vllm.platforms import current_platform
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
@@ -63,10 +64,13 @@ def test_structured_output(
 ):
    monkeypatch.setenv("VLLM_USE_V1", "1")
+    # Don't use eager execution on TPUs because we want to test for no
+    # recompilation at runtime
+    enforce_eager = bool(not current_platform.is_tpu())
    # Use a single LLM instance for several scenarios to
    # speed up the test suite.
    llm = LLM(model=model_name,
-              enforce_eager=True,
+              enforce_eager=enforce_eager,
              max_model_len=1024,
              guided_decoding_backend=guided_decoding_backend,
              tokenizer_mode=tokenizer_mode)

--- a/tests/v1/tpu/test_sampler.py
+++ b/tests/v1/tpu/test_sampler.py
@@ -23,7 +23,7 @@ def test_sampler_different(model_name: str):
    different results.
    """
    llm = LLM(model_name,
-              enforce_eager=True,
+              enforce_eager=False,
              max_num_seqs=1,
              max_model_len=512,
              max_num_batched_tokens=512)