Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7728dd77
Unverified
Commit
7728dd77
authored
Jul 25, 2025
by
QiliangCui
Committed by
GitHub
Jul 26, 2025
Browse files
[TPU][Test] Divide TPU v1 Test into 2 parts. (#21431)
parent
2f6e6b33
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
166 additions
and
12 deletions
+166
-12
.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+166
-0
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+0
-12
No files found.
.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
0 → 100755
View file @
7728dd77
#!/bin/bash
set
-xu
remove_docker_container
()
{
docker
rm
-f
tpu-test
||
true
;
docker
rm
-f
vllm-tpu
||
true
;
}
trap
remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# Build the docker image.
docker build
-f
docker/Dockerfile.tpu
-t
vllm-tpu
.
# Set up cleanup.
cleanup_docker
()
{
# Get Docker's root directory
docker_root
=
$(
docker info
-f
'{{.DockerRootDir}}'
)
if
[
-z
"
$docker_root
"
]
;
then
echo
"Failed to determine Docker root directory."
exit
1
fi
echo
"Docker root directory:
$docker_root
"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage
=
$(
df
"
$docker_root
"
|
tail
-1
|
awk
'{print $5}'
|
sed
's/%//'
)
# Define the threshold
threshold
=
70
if
[
"
$disk_usage
"
-gt
"
$threshold
"
]
;
then
echo
"Disk usage is above
$threshold
%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune
-f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune
-f
&&
docker system prune
--force
--filter
"until=72h"
--all
echo
"Docker images and volumes cleanup completed."
else
echo
"Disk usage is below
$threshold
%. No cleanup needed."
fi
}
cleanup_docker
# For HF_TOKEN.
source
/etc/environment
docker run
--privileged
--net
host
--shm-size
=
16G
-it
\
-e
"HF_TOKEN=
$HF_TOKEN
"
--name
tpu-test
\
vllm-tpu /bin/bash
-c
'
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.
echo "--- Starting script inside Docker container ---"
# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"
# Install dependencies
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
&& python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0
# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
run_test() {
local test_num=$1
local test_name=$2
local test_command=$3
local log_file="$RESULTS_DIR/test_${test_num}.log"
local actual_exit_code
echo "--- TEST_$test_num: Running $test_name ---"
# Execute the test command.
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
actual_exit_code=$?
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
if [ "$actual_exit_code" -ne 0 ]; then
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
if [ -f "$log_file" ]; then
cat "$log_file" >&2
else
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
fi
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
return "$actual_exit_code" # Return the failure code
else
echo "TEST_$test_num ($test_name) PASSED."
return 0 # Return success
fi
}
# Helper function to call run_test and update the overall script exit code
run_and_track_test() {
local test_num_arg="$1"
local test_name_arg="$2"
local test_command_arg="$3"
# Run the test
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
local test_specific_exit_code=$?
# If the test failed, set the overall script exit code to 1
if [ "$test_specific_exit_code" -ne 0 ]; then
# No need for extra echo here, run_test already logged the failure.
overall_script_exit_code=1
fi
}
# --- Actual Test Execution ---
run_and_track_test 1 "test_struct_output_generate.py" \
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 2 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 3 "test_lora.py" \
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 4 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 5 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
'
# IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
# Capture the exit code of the docker run command
DOCKER_RUN_EXIT_CODE
=
$?
# The trap will run for cleanup.
# Exit the main script with the Docker run command's exit code.
if
[
"
$DOCKER_RUN_EXIT_CODE
"
-ne
0
]
;
then
echo
"Docker run command failed with exit code
$DOCKER_RUN_EXIT_CODE
."
exit
"
$DOCKER_RUN_EXIT_CODE
"
else
echo
"Docker run command completed successfully."
exit
0
fi
# TODO: This test fails because it uses RANDOM_SEED sampling
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
View file @
7728dd77
...
...
@@ -150,18 +150,6 @@ run_and_track_test 9 "test_multimodal.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
run_and_track_test 10 "test_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
run_and_track_test 11 "test_struct_output_generate.py" \
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 12 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 13 "test_lora.py" \
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 14 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 16 "test_kv_cache_update_kernel.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment