Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3fb4b5fa
Commit
3fb4b5fa
authored
Mar 23, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.0' into v0.18.0-ori
parents
bcf25339
89138b21
Changes
1000
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
953 additions
and
224 deletions
+953
-224
.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
...dkite/performance-benchmarks/tests/serving-tests-hpu.json
+78
-3
.buildkite/performance-benchmarks/tests/serving-tests.json
.buildkite/performance-benchmarks/tests/serving-tests.json
+0
-4
.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
...te/performance-benchmarks/tests/throughput-tests-hpu.json
+62
-0
.buildkite/release-pipeline.yaml
.buildkite/release-pipeline.yaml
+2
-2
.buildkite/scripts/annotate-rocm-release.sh
.buildkite/scripts/annotate-rocm-release.sh
+3
-3
.buildkite/scripts/cache-rocm-base-wheels.sh
.buildkite/scripts/cache-rocm-base-wheels.sh
+3
-3
.buildkite/scripts/check-ray-compatibility.sh
.buildkite/scripts/check-ray-compatibility.sh
+213
-0
.buildkite/scripts/cherry-pick-from-milestone.sh
.buildkite/scripts/cherry-pick-from-milestone.sh
+1
-1
.buildkite/scripts/hardware_ci/run-amd-test.sh
.buildkite/scripts/hardware_ci/run-amd-test.sh
+441
-160
.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
+65
-0
.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
...ite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+21
-4
.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+1
-1
.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+2
-2
.buildkite/scripts/hardware_ci/run-cpu-test.sh
.buildkite/scripts/hardware_ci/run-cpu-test.sh
+2
-2
.buildkite/scripts/hardware_ci/run-gh200-test.sh
.buildkite/scripts/hardware_ci/run-gh200-test.sh
+1
-1
.buildkite/scripts/hardware_ci/run-hpu-test.sh
.buildkite/scripts/hardware_ci/run-hpu-test.sh
+30
-5
.buildkite/scripts/hardware_ci/run-npu-test.sh
.buildkite/scripts/hardware_ci/run-npu-test.sh
+15
-20
.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+1
-1
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+1
-1
.buildkite/scripts/hardware_ci/run-xpu-test.sh
.buildkite/scripts/hardware_ci/run-xpu-test.sh
+11
-11
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
View file @
3fb4b5fa
...
...
@@ -10,7 +10,6 @@
"server_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3.1-8B-Instruct"
,
"tensor_parallel_size"
:
1
,
"swap_space"
:
16
,
"disable_log_stats"
:
""
,
"load_format"
:
"dummy"
,
"max-model-len"
:
2048
,
...
...
@@ -37,7 +36,6 @@
"server_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3.1-70B-Instruct"
,
"tensor_parallel_size"
:
4
,
"swap_space"
:
16
,
"disable_log_stats"
:
""
,
"load_format"
:
"dummy"
,
"max-model-len"
:
2048
,
...
...
@@ -64,7 +62,6 @@
"server_parameters"
:
{
"model"
:
"mistralai/Mixtral-8x7B-Instruct-v0.1"
,
"tensor_parallel_size"
:
2
,
"swap_space"
:
16
,
"disable_log_stats"
:
""
,
"load_format"
:
"dummy"
,
"max-model-len"
:
2048
,
...
...
@@ -78,5 +75,83 @@
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
200
}
},
{
"test_name"
:
"serving_deepseek_r1"
,
"qps_list"
:
[
1
,
4
,
16
,
"inf"
],
"server_environment_variables"
:
{
"PT_HPU_LAZY_MODE"
:
1
,
"PT_HPU_ENABLE_LAZY_COLLECTIVES"
:
1
,
"VLLM_CONTIGUOUS_PA"
:
1
,
"VLLM_DEFRAG"
:
1
},
"server_parameters"
:
{
"model"
:
"deepseek-ai/DeepSeek-R1"
,
"tensor_parallel_size"
:
8
,
"disable_log_stats"
:
""
,
"load_format"
:
"dummy"
,
"max-model-len"
:
2048
,
"max-num-seqs"
:
200
,
"async-scheduling"
:
""
,
"dtype"
:
"bfloat16"
},
"client_parameters"
:
{
"model"
:
"deepseek-ai/DeepSeek-R1"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
200
}
},
{
"test_name"
:
"serving_llama4_maverick_17b128e_instruct_fp8"
,
"qps_list"
:
[
1
,
4
,
16
,
"inf"
],
"server_environment_variables"
:
{
"PT_HPU_LAZY_MODE"
:
1
,
"PT_HPU_ENABLE_LAZY_COLLECTIVES"
:
1
,
"VLLM_CONTIGUOUS_PA"
:
1
,
"VLLM_DEFRAG"
:
1
},
"server_parameters"
:
{
"model"
:
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
,
"tensor_parallel_size"
:
8
,
"disable_log_stats"
:
""
,
"max-model-len"
:
2048
,
"max-num-seqs"
:
128
,
"async-scheduling"
:
""
,
"enable_expert_parallel"
:
""
,
"max-num-batched-tokens"
:
4096
},
"client_parameters"
:
{
"model"
:
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
200
}
},
{
"test_name"
:
"serving_qwen3_8b"
,
"qps_list"
:
[
1
,
4
,
10
,
"inf"
],
"server_environment_variables"
:
{
"PT_HPU_LAZY_MODE"
:
1
,
"PT_HPU_ENABLE_LAZY_COLLECTIVES"
:
1
,
"VLLM_CONTIGUOUS_PA"
:
1
,
"VLLM_DEFRAG"
:
1
},
"server_parameters"
:
{
"model"
:
"Qwen/Qwen-3-8B"
,
"tensor_parallel_size"
:
1
,
"dtype"
:
"bfloat16"
,
"disable_log_stats"
:
""
,
"async-scheduling"
:
""
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen-3-8B"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
200
}
}
]
.buildkite/performance-benchmarks/tests/serving-tests.json
View file @
3fb4b5fa
...
...
@@ -5,7 +5,6 @@
"server_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3.1-8B-Instruct"
,
"tensor_parallel_size"
:
1
,
"swap_space"
:
16
,
"disable_log_stats"
:
""
,
"load_format"
:
"dummy"
},
...
...
@@ -23,7 +22,6 @@
"server_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3.1-70B-Instruct"
,
"tensor_parallel_size"
:
4
,
"swap_space"
:
16
,
"disable_log_stats"
:
""
,
"load_format"
:
"dummy"
},
...
...
@@ -41,7 +39,6 @@
"server_parameters"
:
{
"model"
:
"mistralai/Mixtral-8x7B-Instruct-v0.1"
,
"tensor_parallel_size"
:
2
,
"swap_space"
:
16
,
"disable_log_stats"
:
""
,
"load_format"
:
"dummy"
},
...
...
@@ -59,7 +56,6 @@
"server_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3.1-70B-Instruct"
,
"tensor_parallel_size"
:
4
,
"swap_space"
:
16
,
"speculative_config"
:
{
"model"
:
"turboderp/Qwama-0.5B-Instruct"
,
"num_speculative_tokens"
:
4
,
...
...
.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
View file @
3fb4b5fa
...
...
@@ -57,5 +57,67 @@
"max-num-seqs"
:
512
,
"async-scheduling"
:
""
}
},
{
"test_name"
:
"throughput_deepseek_r1"
,
"environment_variables"
:
{
"PT_HPU_LAZY_MODE"
:
1
,
"PT_HPU_ENABLE_LAZY_COLLECTIVES"
:
1
,
"VLLM_CONTIGUOUS_PA"
:
1
,
"VLLM_DEFRAG"
:
1
},
"parameters"
:
{
"model"
:
"deepseek-ai/DeepSeek-R1"
,
"tensor_parallel_size"
:
8
,
"load_format"
:
"dummy"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"dataset_name"
:
"sharegpt"
,
"num_prompts"
:
1000
,
"backend"
:
"vllm"
,
"max-model-len"
:
2048
,
"max-num-seqs"
:
384
,
"async-scheduling"
:
""
}
},
{
"test_name"
:
"throughput_llama4_maverick_17b128e_instruct_fp8"
,
"environment_variables"
:
{
"PT_HPU_LAZY_MODE"
:
1
,
"PT_HPU_ENABLE_LAZY_COLLECTIVES"
:
1
,
"VLLM_CONTIGUOUS_PA"
:
1
,
"VLLM_DEFRAG"
:
1
},
"parameters"
:
{
"model"
:
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
,
"tensor_parallel_size"
:
8
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"dataset_name"
:
"sharegpt"
,
"num_prompts"
:
1000
,
"backend"
:
"vllm"
,
"max-model-len"
:
2048
,
"max-num-seqs"
:
512
,
"async-scheduling"
:
""
,
"enable_expert_parallel"
:
""
}
},
{
"test_name"
:
"throughput_qwen3_8b"
,
"environment_variables"
:
{
"PT_HPU_LAZY_MODE"
:
1
,
"PT_HPU_ENABLE_LAZY_COLLECTIVES"
:
1
,
"VLLM_CONTIGUOUS_PA"
:
1
,
"VLLM_DEFRAG"
:
1
},
"parameters"
:
{
"model"
:
"Qwen/Qwen-3-8B"
,
"tensor_parallel_size"
:
1
,
"load_format"
:
"dummy"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"dataset_name"
:
"sharegpt"
,
"num_prompts"
:
1000
,
"max-num-seqs"
:
512
,
"backend"
:
"vllm"
,
"async-scheduling"
:
""
}
}
]
.buildkite/release-pipeline.yaml
View file @
3fb4b5fa
...
...
@@ -83,7 +83,7 @@ steps:
agents
:
queue
:
cpu_queue_postmerge
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_CPU_
AVX512BF16=true
--build-arg
VLLM_CPU_AVX512VNNI=true
--build-arg
VLLM_CPU_AMXBF1
6=true
--tag
vllm-ci:build-image
--target
vllm-build
--progress
plain
-f
docker/Dockerfile.cpu
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_CPU_
X8
6=true
--tag
vllm-ci:build-image
--target
vllm-build
--progress
plain
-f
docker/Dockerfile.cpu
."
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/scripts/upload-nightly-wheels.sh
manylinux_2_35"
...
...
@@ -152,7 +152,7 @@ steps:
queue
:
cpu_queue_postmerge
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_CPU_
AVX512BF16=true
--build-arg
VLLM_CPU_AVX512VNNI=true
--build-arg
VLLM_CPU_AMXBF1
6=true
--tag
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent
meta-data
get
release-version)
--tag
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
--progress
plain
--target
vllm-openai
-f
docker/Dockerfile.cpu
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_CPU_
X8
6=true
--tag
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent
meta-data
get
release-version)
--tag
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
--progress
plain
--target
vllm-openai
-f
docker/Dockerfile.cpu
."
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent
meta-data
get
release-version)"
env
:
...
...
.buildkite/scripts/annotate-rocm-release.sh
View file @
3fb4b5fa
...
...
@@ -25,7 +25,7 @@ S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL
=
"http://
${
S3_BUCKET
}
.s3-website-
${
S3_REGION
}
.amazonaws.com"
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
ROCM_VERSION_PATH
=
"rocm
$(
echo
${
ROCM_VERSION
}
|
tr
-d
'.'
)
"
ROCM_VERSION_PATH
=
"rocm
$(
echo
"
${
ROCM_VERSION
}
"
|
tr
-d
'.'
)
"
ROCM_PATH
=
"rocm/
${
BUILDKITE_COMMIT
}
/
${
ROCM_VERSION_PATH
}
"
buildkite-agent annotate
--style
'success'
--context
'rocm-release-workflow'
<<
EOF
## ROCm Wheel and Docker Image Releases
...
...
@@ -68,7 +68,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton
aws s3 cp s3://
${
S3_BUCKET
}
/rocm/
${
BUILDKITE_COMMIT
}
/
${
ROCM_VERSION_PATH
}
/torchvision-*.whl .
aws s3 cp s3://
${
S3_BUCKET
}
/rocm/
${
BUILDKITE_COMMIT
}
/
${
ROCM_VERSION_PATH
}
/torchaudio-*.whl .
aws s3 cp s3://
${
S3_BUCKET
}
/rocm/
${
BUILDKITE_COMMIT
}
/
${
ROCM_VERSION_PATH
}
/amdsmi-*.whl .
aws s3 cp s3://
${
S3_BUCKET
}
/rocm/
${
BUILDKITE_COMMIT
}
/
${
ROCM_VERSION_PATH
}
/aiter-*.whl .
aws s3 cp s3://
${
S3_BUCKET
}
/rocm/
${
BUILDKITE_COMMIT
}
/
${
ROCM_VERSION_PATH
}
/
amd_
aiter-*.whl .
aws s3 cp s3://
${
S3_BUCKET
}
/rocm/
${
BUILDKITE_COMMIT
}
/
${
ROCM_VERSION_PATH
}
/flash-attn-*.whl .
\`\`\`
...
...
@@ -80,7 +80,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-
- **torchvision**: TorchVision for ROCm PyTorch
- **torchaudio**: Torchaudio for ROCm PyTorch
- **amdsmi**: AMD SMI Python bindings
- **aiter**: Aiter for ROCm
- **
amd_
aiter**: Aiter for ROCm
- **flash-attn**: Flash Attention for ROCm
### :warning: Notes
...
...
.buildkite/scripts/cache-rocm-base-wheels.sh
View file @
3fb4b5fa
...
...
@@ -83,7 +83,7 @@ case "${1:-}" in
exit
1
fi
WHEEL_COUNT
=
$(
ls
artifacts/rocm-base-wheels
/
*
.whl 2>/dev/null |
wc
-l
)
WHEEL_COUNT
=
$(
find
artifacts/rocm-base-wheels
-maxdepth
1
-name
'
*.whl
'
2>/dev/null |
wc
-l
)
if
[[
"
$WHEEL_COUNT
"
-eq
0
]]
;
then
echo
"ERROR: No wheels found in artifacts/rocm-base-wheels/"
>
&2
exit
1
...
...
@@ -110,9 +110,9 @@ case "${1:-}" in
echo
""
echo
"Downloaded wheels:"
ls
-lh
artifacts/rocm-base-wheels
/
find
artifacts/rocm-base-wheels
-maxdepth
1
-name
'*.whl'
-exec
ls
-lh
{}
\;
WHEEL_COUNT
=
$(
ls
artifacts/rocm-base-wheels
/
*
.whl 2>/dev/null |
wc
-l
)
WHEEL_COUNT
=
$(
find
artifacts/rocm-base-wheels
-maxdepth
1
-name
'
*.whl
'
2>/dev/null |
wc
-l
)
echo
""
echo
"Total:
$WHEEL_COUNT
wheels"
echo
"========================================"
...
...
.buildkite/scripts/check-ray-compatibility.sh
0 → 100644
View file @
3fb4b5fa
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Check if Ray LLM can generate lock files that are compatible with this
# version of vllm. Downloads Ray's requirement files and runs a full
# dependency resolution with the installed vllm's constraints to see if
# a valid lock file can be produced.
#
# See: https://github.com/vllm-project/vllm/issues/33599
# Abort on the first failing command and on failures inside pipelines.
set -eo pipefail

RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"

# Scratch directory for the downloaded requirement files and the resolver
# output; it is removed again when the script exits.
WORK_DIR=$(mktemp -d)
trap 'rm -rf "$WORK_DIR"' EXIT

# Fetch all Ray requirement files used in the LLM depset pipeline
echo ">>> Fetching Ray requirement files"
RAY_FILES=(
    "requirements.txt"
    "requirements/cloud-requirements.txt"
    "requirements/base-test-requirements.txt"
    "requirements/llm/llm-requirements.txt"
    "requirements/llm/llm-test-requirements.txt"
)
for FILE in "${RAY_FILES[@]}"; do
    # Each file is stored flat in WORK_DIR under its base name.
    LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")"
    echo "    ${FILE}"
    # Retry transient network errors: with `set -e` a single failed download
    # would otherwise abort the whole compatibility check.
    curl -fsSL --retry 3 --retry-delay 2 -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}"
done
# Extract installed vllm deps
echo ">>> Extracting installed vllm dependency constraints"
python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF'
"""Dump the installed vllm's requirements as pip constraint lines.

Ray depends on vllm[audio], so requirements guarded by the "audio" extra are
kept and their extra marker is removed; requirements guarded only by other
extras are dropped.  The resolver cannot evaluate extra markers for a
package that is not itself being resolved from an index, so the extra is
activated by hand here.
"""
import importlib.metadata
import re
import sys

# Ray uses vllm[audio] – activate that extra.
ACTIVE_EXTRAS = {"audio"}
EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""")


def to_constraint(requirement):
    """Return the constraint line for one requirement, or None to drop it."""
    spec, sep, marker = requirement.partition(";")
    if not sep:
        # Unconditional dependency: always kept.
        return requirement.strip()

    marker = marker.strip()
    extras = EXTRA_RE.findall(marker)
    if not extras:
        # Marker without any extra (python_version, etc.): kept unchanged.
        return requirement.strip()
    if ACTIVE_EXTRAS.isdisjoint(extras):
        # Belongs only to extras that are not active (tensorizer, bench, …).
        return None

    # Remove the extra== terms, then tidy up the "and" operators left behind
    # so that any remaining marker (e.g. python_version) stays intact.
    remaining = EXTRA_RE.sub("", marker)
    remaining = re.sub(r"\band\b\s*\band\b", "and", remaining)
    remaining = re.sub(r"^\s*and\s+|\s+and\s*$", "", remaining).strip()

    spec = spec.strip()
    return f"{spec} ; {remaining}" if remaining else spec


out_path = sys.argv[1]
requirements = importlib.metadata.requires("vllm") or []
constraints = [c for c in map(to_constraint, requirements) if c is not None]

with open(out_path, "w") as f:
    f.writelines(c + "\n" for c in constraints)

print(f"Wrote {len(constraints)} constraints to {out_path}")
PYEOF
# Show the beginning of the generated constraints file in the build log.
echo ">>> Installed vllm deps (first 20 lines):"
head -n 20 "${WORK_DIR}/vllm-constraints.txt"

# Drop Ray's own vllm pin from its LLM requirements.  In-development vllm
# versions cannot be resolved from PyPI, so the constraints written above
# stand in for vllm: the check is whether Ray's requirements can coexist
# with vllm's dependency constraints.
sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt"

# Install uv only when it is not already on PATH.
command -v uv &>/dev/null || {
    echo ">>> Installing uv"
    pip install uv -q
}
# Resolve: given vllm's constraints, can Ray compile a lock file?
#
# vllm's constraints are the fixed side and are passed with -c, so the
# resolver treats them as hard bounds.  Ray is the flexible side (it can
# regenerate its lock files), so the question is whether Ray's requirements
# can still be satisfied inside those bounds.
echo ""
echo "============================================================"
echo ">>> Resolving: Can Ray generate compatible lock files?"
echo "============================================================"

COMPILE_ARGS=(
    "${WORK_DIR}/requirements.txt"
    "${WORK_DIR}/cloud-requirements.txt"
    "${WORK_DIR}/base-test-requirements.txt"
    "${WORK_DIR}/llm-requirements.txt"
    "${WORK_DIR}/llm-test-requirements.txt"
    -c "${WORK_DIR}/vllm-constraints.txt"
    --python-version 3.12
    --python-platform x86_64-manylinux_2_31
    --extra-index-url https://download.pytorch.org/whl/cu129
    --index-strategy unsafe-best-match
    --unsafe-package setuptools
    --unsafe-package ray
    --no-header
    -o "${WORK_DIR}/resolved.txt"
)

# Capture the resolver's exit status instead of letting errexit stop the
# script: a failed resolution is reported further down.
EXIT_CODE=0
uv pip compile "${COMPILE_ARGS[@]}" 2>&1 || EXIT_CODE=$?
# Report the resolver outcome.  On success, list a few key pinned versions
# and leave with status 0; on failure, print the explanation and fall through.
printf '\n%s\n' "=========================================="
if [ "$EXIT_CODE" -eq 0 ]; then
    cat <<'MSG'
SUCCESS: Ray can generate lock files compatible with this vllm.

Key resolved versions:
MSG
    # grep exits non-zero when nothing matches; that must not fail the script.
    grep -E '^(protobuf|torch|numpy|transformers)==' "${WORK_DIR}/resolved.txt" \
        | sort || true
    echo "=========================================="
    exit 0
fi

cat <<'MSG'
FAILURE: Ray cannot generate lock files compatible with this vllm.
This means a fundamental dependency conflict exists that Ray
cannot resolve by regenerating its lock files.
See: https://github.com/vllm-project/vllm/issues/33599
==========================================
MSG
# Buildkite annotation
# Probe PATH for the agent (the same way uv is probed earlier) instead of
# testing one hard-coded install location.  The annotation is best-effort:
# a failure to post it must not stop the script before the Slack
# notification and the final non-zero exit.
if command -v buildkite-agent &>/dev/null; then
    # The delimiter is quoted so the Markdown body is passed through verbatim.
    buildkite-agent annotate --style 'warning' --context 'ray-compat' <<'EOF' \
        || echo ">>> Failed to post Buildkite annotation" >&2
### :warning: Ray Dependency Compatibility Warning
This PR introduces dependencies that **cannot** be resolved with Ray's requirements.
Ray would not be able to regenerate its lock files to accommodate this vllm version.
Please check the **Ray Dependency Compatibility Check** step logs for details.
See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context.
EOF
fi
# Notify Slack if webhook is configured and PR/branch are valid.
# The notification is best-effort; the script always ends with status 1 here
# because this point is only reached when the resolution failed.
if [ -n "${RAY_COMPAT_SLACK_WEBHOOK_URL:-}" ]; then
    PR="${BUILDKITE_PULL_REQUEST:-}"
    BRANCH="${BUILDKITE_BRANCH:-}"

    # Skip notification if PR is invalid or branch is empty
    if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
        echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
    else
        echo ">>> Sending Slack notification"
        # The payload is built by Python reading its program from a quoted
        # heredoc, so the shell expands nothing inside it and no quote
        # escaping is needed.
        PAYLOAD=$(python3 - <<'PYEOF'
import json, os
pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
branch = os.getenv("BUILDKITE_BRANCH", "unknown")
url = os.getenv("BUILDKITE_BUILD_URL", "#")
data = {
    "text": ":warning: Ray Dependency Compatibility Check Failed",
    "blocks": [{
        "type": "section",
        "text": {
            "type": "mrkdwn",
            "text": (
                "*:warning: Ray Dependency Compatibility Check Failed*\n"
                f"PR #{pr} on branch `{branch}` introduces dependencies "
                f"that cannot be resolved with Ray's requirements.\n"
                f"<{url}|View Build>"
            ),
        },
    }],
}
print(json.dumps(data))
PYEOF
        )

        # Bound the request time and tolerate curl failures: an unreachable
        # webhook must neither hang the job nor abort before the response
        # line and the final exit below.
        HTTP_CODE=$(curl -s --max-time 30 -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
            -H 'Content-type: application/json' \
            -d "$PAYLOAD" || true)
        echo "    Slack webhook response: $HTTP_CODE"
    fi
else
    echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
fi

exit 1
.buildkite/scripts/cherry-pick-from-milestone.sh
View file @
3fb4b5fa
...
...
@@ -134,7 +134,7 @@ log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
# Store PR data in a temp file
PR_DATA
=
$(
mktemp
)
trap
"
rm -f
$PR_DATA
"
EXIT
trap
'
rm -f
"
$PR_DATA"
'
EXIT
if
!
gh
pr
list
--state
merged
--search
"milestone:
${
MILESTONE
}
"
\
--limit
1000
\
...
...
.buildkite/scripts/hardware_ci/run-amd-test.sh
View file @
3fb4b5fa
This diff is collapsed.
Click to expand it.
.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
0 → 100755
View file @
3fb4b5fa
#!/bin/bash
# Abort on errors, unset variables and pipeline failures; trace each command.
set -euox pipefail

# Environment for the CPU CI run.
export VLLM_CPU_KVCACHE_SPACE=1 VLLM_CPU_CI_ENV=1
# Reduce sub-processes for acceleration
export TORCH_COMPILE_DISABLE=1 VLLM_ENABLE_V1_MULTIPROCESSING=0

# Download the pinned SDE archive, verify its SHA-256 checksum, and unpack it
# into ./sde with the archive's top-level directory stripped.
SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
SDE_URL="https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"

wget "${SDE_URL}"
sha256sum --check <<<"${SDE_CHECKSUM}  ${SDE_ARCHIVE}"
mkdir -p sde
tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
# Wait for a background job and report its outcome.
#
# Arguments:
#   $1 - PID of a child process of the current shell.
#   $2 - path of the log file that process writes to.
# Returns:
#   0 if the process exited with status 0;
#   1 if it exited non-zero (the last 50 lines of the log are printed first)
#     or if an argument is missing.
wait_for_pid_and_check_log() {
    # Default to empty so that a missing argument reaches the usage check
    # below instead of aborting with "unbound variable" under `set -u`.
    local pid="${1:-}"
    local log_file="${2:-}"
    local exit_status=0

    if [ -z "$pid" ] || [ -z "$log_file" ]; then
        echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>" >&2
        return 1
    fi

    echo "Waiting for process $pid to finish..."

    # `wait` returns the exit status of the waited-for process.  Capturing it
    # through `||` keeps a non-zero status from tripping `set -e`.
    wait "$pid" || exit_status=$?

    if [ "$exit_status" -eq 0 ]; then
        echo "Process $pid finished with exit status $exit_status (Success)."
    else
        echo "Process $pid finished with exit status $exit_status (Failure)."
        echo "Process exited with a non-zero status."
        echo "--- Last few lines of log file: $log_file ---"
        # A missing or unreadable log must not hide the failure report.
        tail -n 50 "$log_file" || true
        echo "---------------------------------------------"
        return 1 # Indicate failure based on exit status
    fi

    echo "No errors detected in log file and process exited successfully."
    return 0
}
# Run the same offline-inference example under three SDE CPU targets in
# parallel, each writing to its own log file.

# Test Skylake (AVX512F)
./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
PID_TEST_0=$!

# Test Cascade Lake (AVX512F + VNNI)
./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
PID_TEST_1=$!

# Test Cooper Lake (AVX512F + VNNI + BF16)
./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
PID_TEST_2=$!

# Wait for every target before failing.  Under `set -e` a bare failing check
# would abort the script at the first failure, so the remaining targets would
# never be waited on and their logs never shown.
FAILED=0
wait_for_pid_and_check_log "$PID_TEST_0" test_0.log || FAILED=1
wait_for_pid_and_check_log "$PID_TEST_1" test_1.log || FAILED=1
wait_for_pid_and_check_log "$PID_TEST_2" test_2.log || FAILED=1

if [ "$FAILED" -ne 0 ]; then
    exit 1
fi
.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
View file @
3fb4b5fa
#!/bin/bash
set
-euox
pipefail
export
VLLM_CPU_CI_ENV
=
0
echo
"--- PP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct
-tp
=
2
-pp
=
2 &
server_pid
=
$!
timeout
600 bash
-c
"until curl localhost:8000/v1/models; do sleep 1; done"
||
exit
1
timeout
600 bash
-c
"until curl localhost:8000/v1/models
> /dev/null 2>&1
; do sleep 1; done"
||
exit
1
vllm bench serve
\
--backend
vllm
\
--dataset-name
random
\
--model
meta-llama/Llama-3.2-3B-Instruct
\
--num-prompts
20
\
--result-dir
./test_results
\
--result-filename
tp_pp.json
\
--save-result
\
--endpoint
/v1/completions
kill
-s
SIGTERM
$server_pid
&
kill
-s
SIGTERM
$server_pid
;
wait
$server_pid
||
true
failed_req
=
$(
jq
'.failed'
./test_results/tp_pp.json
)
if
[
"
$failed_req
"
-ne
0
]
;
then
echo
"Some requests were failed!"
exit
1
fi
echo
"--- DP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct
-tp
=
2
-dp
=
2 &
server_pid
=
$!
timeout
600 bash
-c
"until curl localhost:8000/v1/models; do sleep 1; done"
||
exit
1
timeout
600 bash
-c
"until curl localhost:8000/v1/models
> /dev/null 2>&1
; do sleep 1; done"
||
exit
1
vllm bench serve
\
--backend
vllm
\
--dataset-name
random
\
--model
meta-llama/Llama-3.2-3B-Instruct
\
--num-prompts
20
\
--result-dir
./test_results
\
--result-filename
dp_pp.json
\
--save-result
\
--endpoint
/v1/completions
kill
-s
SIGTERM
$server_pid
&
kill
-s
SIGTERM
$server_pid
;
wait
$server_pid
||
true
failed_req
=
$(
jq
'.failed'
./test_results/dp_pp.json
)
if
[
"
$failed_req
"
-ne
0
]
;
then
echo
"Some requests were failed!"
exit
1
fi
.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
View file @
3fb4b5fa
...
...
@@ -34,7 +34,7 @@ function cpu_tests() {
# offline inference
docker
exec
cpu-test bash
-c
"
set -e
python3 examples/offline_inference/
basic/
generate.py --model facebook/opt-125m"
python3 examples/
basic/
offline_inference/generate.py --model facebook/opt-125m"
# Run model tests
docker
exec
cpu-test bash
-c
"
...
...
.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
View file @
3fb4b5fa
...
...
@@ -27,7 +27,7 @@ function cpu_tests() {
podman
exec
-it
"
$container_id
"
bash
-c
"
export TORCH_COMPILE_DISABLE=1
set -xve
python3 examples/offline_inference/
basic/
generate.py --model facebook/opt-125m"
>>
$HOME
/test_basic.log
python3 examples/
basic/
offline_inference/generate.py --model facebook/opt-125m"
>>
"
$HOME
"
/test_basic.log
# Run basic model test
podman
exec
-it
"
$container_id
"
bash
-c
"
...
...
@@ -43,7 +43,7 @@ function cpu_tests() {
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
>>
$HOME
/test_rest.log
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
>>
"
$HOME
"
/test_rest.log
}
# All of CPU tests are expected to be finished less than 40 mins.
...
...
.buildkite/scripts/hardware_ci/run-cpu-test.sh
View file @
3fb4b5fa
...
...
@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
docker build
--progress
plain
--tag
"
$IMAGE_NAME
"
--target
vllm-test
-f
docker/Dockerfile.cpu
.
# Run the image, setting --shm-size=4g for tensor parallel.
docker run
--rm
--cpuset-cpus
=
$CORE_RANGE
--cpuset-mems
=
$NUMA_NODE
-v
~/.cache/huggingface:/root/.cache/huggingface
--privileged
=
true
-e
HF_TOKEN
-e
VLLM_CPU_KVCACHE_SPACE
=
16
-e
VLLM_CPU_CI_ENV
=
1
-e
VLLM_CPU_SIM_MULTI_NUMA
=
1
--shm-size
=
4g
$IMAGE_NAME
\
timeout
$TIMEOUT_VAL
bash
-c
"set -euox pipefail; echo
\"
--- Print packages
\"
; pip list; echo
\"
--- Running tests
\"
;
${
TEST_COMMAND
}
"
docker run
--rm
--cpuset-cpus
=
"
$CORE_RANGE
"
--cpuset-mems
=
"
$NUMA_NODE
"
-v
~/.cache/huggingface:/root/.cache/huggingface
--privileged
=
true
-e
HF_TOKEN
-e
VLLM_CPU_KVCACHE_SPACE
=
16
-e
VLLM_CPU_CI_ENV
=
1
-e
VLLM_CPU_SIM_MULTI_NUMA
=
1
--shm-size
=
4g
"
$IMAGE_NAME
"
\
timeout
"
$TIMEOUT_VAL
"
bash
-c
"set -euox pipefail; echo
\"
--- Print packages
\"
; pip list; echo
\"
--- Running tests
\"
;
${
TEST_COMMAND
}
"
.buildkite/scripts/hardware_ci/run-gh200-test.sh
View file @
3fb4b5fa
...
...
@@ -25,5 +25,5 @@ remove_docker_container
# Run the image and test offline inference
docker run
-e
HF_TOKEN
-e
VLLM_WORKER_MULTIPROC_METHOD
=
spawn
-v
/root/.cache/huggingface:/root/.cache/huggingface
--name
gh200-test
--gpus
=
all
--entrypoint
=
""
gh200-test bash
-c
'
python3 examples/offline_inference/
basic/
generate.py --model meta-llama/Llama-3.2-1B
python3 examples/
basic/
offline_inference/generate.py --model meta-llama/Llama-3.2-1B
'
.buildkite/scripts/hardware_ci/run-hpu-test.sh
View file @
3fb4b5fa
#!/bin/bash
# This script build the
C
PU docker image and run the offline inference inside the container.
# This script build
s
the
H
PU docker image and run
s
the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
#
# vllm-gaudi compatibility pinning:
# The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
# When upstream vllm changes its API, the plugin may break before it has been updated.
# To handle this, the vllm-gaudi repository maintains a file:
# vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
# The first line of that file controls what version of vllm is used inside the Docker image:
# - "latest" : no checkout override; the current Buildkite CI commit is used as-is.
# - "<commit SHA>" : vllm is checked out to that specific commit before building, pinning
# the test to a known-compatible baseline.
# To unpin (resume testing against the live vllm tip), set the file content back to "latest".
set
-exuo
pipefail
# Fetch the vllm community commit reference from vllm-gaudi (first line only).
VLLM_COMMUNITY_COMMIT
=
$(
curl
-s
\
https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
\
|
head
-1
|
tr
-d
'\n'
)
echo
"Using vllm community commit:
${
VLLM_COMMUNITY_COMMIT
}
"
# Try building the docker image
image_name
=
"hpu/upstream-vllm-ci:
${
BUILDKITE_COMMIT
}
"
container_name
=
"hpu-upstream-vllm-ci-
${
BUILDKITE_COMMIT
}
-container"
cat
<<
EOF
| docker build -t
${
image_name
}
-f - .
cat
<<
EOF
| docker build -t
"
${
image_name
}
"
-f - .
FROM gaudi-base-image:latest
COPY ./ /workspace/vllm
# If VLLM_COMMUNITY_COMMIT is a specific commit (not "latest"), check it out to pin vllm
# to the version known to be compatible with vllm-gaudi. When the value is "latest",
# the current checkout (the Buildkite CI commit) is used unchanged.
RUN if [ "
${
VLLM_COMMUNITY_COMMIT
}
" != "latest" ]; then
\
cd /workspace/vllm && git fetch --unshallow 2>/dev/null || true && git checkout
${
VLLM_COMMUNITY_COMMIT
}
;
\
fi
WORKDIR /workspace/vllm
ENV no_proxy=localhost,127.0.0.1
...
...
@@ -39,19 +64,19 @@ EOF
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE
=
1
remove_docker_containers
()
{
docker
rm
-f
${
container_name
}
||
true
;
}
remove_docker_containers
()
{
docker
rm
-f
"
${
container_name
}
"
||
true
;
}
trap
'remove_docker_containers; exit $EXITCODE;'
EXIT
remove_docker_containers
echo
"Running HPU plugin v1 test"
docker run
--rm
--runtime
=
habana
--name
=
${
container_name
}
--network
=
host
\
docker run
--rm
--runtime
=
habana
--name
=
"
${
container_name
}
"
--network
=
host
\
-e
HABANA_VISIBLE_DEVICES
=
all
\
-e
VLLM_SKIP_WARMUP
=
true
\
-e
PT_HPU_ENABLE_LAZY_COLLECTIVES
=
true
\
-e
PT_HPU_LAZY_MODE
=
1
\
"
${
image_name
}
"
\
/bin/bash
-c
'
cd vllm; timeout 120s python -u examples/offline_inference/
basic/
generate.py --model facebook/opt-125m
cd vllm; timeout 120s python -u examples/
basic/
offline_inference/generate.py --model facebook/opt-125m
'
EXITCODE
=
$?
...
...
.buildkite/scripts/hardware_ci/run-npu-test.sh
View file @
3fb4b5fa
...
...
@@ -41,6 +41,7 @@ get_config() {
echo
"Error: file '
${
TEST_RUN_CONFIG_FILE
}
' does not exist in the warehouse"
>
&2
exit
1
fi
# shellcheck source=/dev/null
source
"
${
TEST_RUN_CONFIG_FILE
}
"
echo
"Base docker image name that get from configuration:
${
BASE_IMAGE_NAME
}
"
return
0
...
...
@@ -48,9 +49,8 @@ get_config() {
# get test running configuration.
fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script.
if
[
$?
-ne
0
]
;
then
if
!
get_config
;
then
exit
1
fi
...
...
@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo
"agent_idx:
${
agent_idx
}
"
builder_name
=
"cachebuilder
${
agent_idx
}
"
builder_cache_dir
=
"/mnt/docker-cache
${
agent_idx
}
"
mkdir
-p
${
builder_cache_dir
}
mkdir
-p
"
${
builder_cache_dir
}
"
# Try building the docker image
cat
<<
EOF
| DOCKER_BUILDKIT=1 docker build
\
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:
${
PYPI_CACHE_HOST
}
\
--builder
${
builder_name
}
--cache-from type=local,src=
${
builder_cache_dir
}
\
--cache-to type=local,dest=
${
builder_cache_dir
}
,mode=max
\
--progress=plain --load -t
${
image_name
}
-f - .
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:
"
${
PYPI_CACHE_HOST
}
"
\
--builder
"
${
builder_name
}
"
--cache-from type=local,src=
"
${
builder_cache_dir
}
"
\
--cache-to type=local,dest=
"
${
builder_cache_dir
}
"
,mode=max
\
--progress=plain --load -t
"
${
image_name
}
"
-f - .
FROM
${
BASE_IMAGE_NAME
}
# Define environments
...
...
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi &&
\
source /usr/local/Ascend/ascend-toolkit/set_env.sh &&
\
source /usr/local/Ascend/nnal/atb/set_env.sh &&
\
export LD_LIBRARY_PATH=
\$
LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/
`
uname -i
`
-linux/devlib &&
\
export LD_LIBRARY_PATH=
\$
LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/
$(
uname
-i
)
-linux/devlib &&
\
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
...
...
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
# returns
--device /dev/davinci0
--device /dev/davinci
1
# returns
one argument per line:
--device
,
/dev/davinci
X, ...
parse_and_gen_devices
()
{
local
input
=
"
$1
"
local
index cards_num
...
...
@@ -151,29 +151,24 @@ parse_and_gen_devices() {
return
1
fi
local
devices
=
""
local
i
=
0
while
((
i < cards_num
))
;
do
local
dev_idx
=
$((
(
index
-
1
)*
cards_num
+
i
))
devices
=
"
$devices
--device /dev/davinci
${
dev_idx
}
"
printf
'%s\n'
"--device"
printf
'%s\n'
"/dev/davinci
${
dev_idx
}
"
((
i++
))
done
# trim leading space
devices
=
"
${
devices
#
"
${
devices
%%[![
:space:]]
*
}
"
}
"
# Output devices: assigned to the caller variable
printf
'%s'
"
$devices
"
}
devices
=
$
(
parse_and_gen_devices
"
${
BUILDKITE_AGENT_NAME
}
"
)
||
exit
1
mapfile
-t
device_args < <
(
parse_and_gen_devices
"
${
BUILDKITE_AGENT_NAME
}
"
)
||
exit
1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
model_cache_dir
=
/mnt/modelscope
${
agent_idx
}
mkdir
-p
${
model_cache_dir
}
mkdir
-p
"
${
model_cache_dir
}
"
docker run
\
${
device
s
}
\
"
${
device
_args
[@]
}
"
\
--device
/dev/davinci_manager
\
--device
/dev/devmm_svm
\
--device
/dev/hisi_hdc
\
...
...
@@ -182,7 +177,7 @@ docker run \
-v
/usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/
\
-v
/usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info
\
-v
/etc/ascend_install.info:/etc/ascend_install.info
\
-v
${
model_cache_dir
}
:/root/.cache/modelscope
\
-v
"
${
model_cache_dir
}
"
:/root/.cache/modelscope
\
--entrypoint
=
""
\
--name
"
${
container_name
}
"
\
"
${
image_name
}
"
\
...
...
.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
View file @
3fb4b5fa
...
...
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.
9.2
" \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.
11
" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
...
...
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
View file @
3fb4b5fa
...
...
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.
9.2
" \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.
11
" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
...
...
.buildkite/scripts/hardware_ci/run-xpu-test.sh
View file @
3fb4b5fa
...
...
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name
=
"xpu_
${
BUILDKITE_COMMIT
}
_
$(
tr
-dc
A-Za-z0-9 < /dev/urandom |
head
-c
10
;
echo
)
"
# Try building the docker image
docker build
-t
${
image_name
}
-f
docker/Dockerfile.xpu
.
docker build
-t
"
${
image_name
}
"
-f
docker/Dockerfile.xpu
.
# Setup cleanup
remove_docker_container
()
{
...
...
@@ -34,17 +34,17 @@ docker run \
set -e
echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0
python3 examples/offline_inference/
basic/
generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/
basic/
generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/
basic/
generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/
basic/
generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/
basic/
generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/offline_inference/
basic/
generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/offline_inference/
basic/
generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/offline_inference/
basic/
generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/offline_inference/
basic/
generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
python3 examples/
basic/
offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/
basic/
offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/
basic/
offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/
basic/
offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/
basic/
offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/
basic/
offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/
basic/
offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/
basic/
offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/
basic/
offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
--ignore=v1/core/test_scheduler_e2e.py
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
...
...
Prev
1
2
3
4
5
6
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment