[CI] Basic Integration Test For TPU (#9968)

Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>

[CI] Basic Integration Test For TPU (#9968)
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
1c45f4c3 · Robert Shaw · GitHub · 603a661a · 1c45f4c3 · 1c45f4c3
Unverified commit 1c45f4c3, authored Nov 04, 2024 by Robert Shaw and committed by GitHub on Nov 04, 2024.
Hide whitespace changes
Inline Side-by-side

Showing with 16 additions and 3 deletions

.buildkite/run-tpu-test.sh +1 -1

tests/entrypoints/openai/test_accuracy.py +15 -2

No files found.
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@@ -12,4 +12,4 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest  && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
--- a/tests/entrypoints/openai/test_accuracy.py
+++ b/tests/entrypoints/openai/test_accuracy.py
@@ -10,6 +10,8 @@ AsyncLLMEngine are working correctly.
 import lm_eval
 import pytest
+from vllm.platforms import current_platform
 from ...utils import RemoteOpenAIServer
 MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
@@ -18,12 +20,21 @@ TASK = "gsm8k"
 FILTER = "exact_match,strict-match"
 RTOL = 0.03
 EXPECTED_VALUE = 0.58
-DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"]
+DEFAULT_ARGS = ["--max-model-len", "2048", "--disable-log-requests"]
 MORE_ARGS_LIST = [
+    [],  # Default
    ["--enable-chunked-prefill"],  # Chunked
    ["--num-scheduler-steps", "8"],  # MS
    ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"]  # MS+Stream
 ]
+MAX_WAIT_SECONDS = None
+if current_platform.is_tpu():
+    MORE_ARGS_LIST = [
+        [],  # Default
+        # ["--num-scheduler-steps", "8"], # Multi-step << currently fails
+    ]
+    MAX_WAIT_SECONDS = 600
 @pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
@@ -33,7 +44,9 @@ def test_lm_eval_accuracy(more_args):
    print(f"Running with: {args}")
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    with RemoteOpenAIServer(
+            MODEL_NAME, args,
+            max_wait_seconds=MAX_WAIT_SECONDS) as remote_server:
        url = f"{remote_server.url_for('v1')}/completions"
        model_args = (