Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
82e40fb7
Commit
82e40fb7
authored
Jan 27, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.15.0rc1' into v0.15.0rc1-ori
parents
30a1922e
58996f35
Changes
314
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
526 additions
and
100 deletions
+526
-100
.buildkite/ci_config.yaml
.buildkite/ci_config.yaml
+2
-1
.buildkite/hardware_tests/amd.yaml
.buildkite/hardware_tests/amd.yaml
+28
-0
.buildkite/hardware_tests/arm.yaml
.buildkite/hardware_tests/arm.yaml
+8
-0
.buildkite/hardware_tests/ascend_npu.yaml
.buildkite/hardware_tests/ascend_npu.yaml
+10
-0
.buildkite/hardware_tests/gh200.yaml
.buildkite/hardware_tests/gh200.yaml
+10
-0
.buildkite/hardware_tests/intel.yaml
.buildkite/hardware_tests/intel.yaml
+23
-0
.buildkite/image_build/image_build.sh
.buildkite/image_build/image_build.sh
+241
-43
.buildkite/image_build/image_build.yaml
.buildkite/image_build/image_build.yaml
+2
-1
.buildkite/test-amd.yaml
.buildkite/test-amd.yaml
+1
-1
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+2
-2
.buildkite/test_areas/attention.yaml
.buildkite/test_areas/attention.yaml
+2
-2
.buildkite/test_areas/compile.yaml
.buildkite/test_areas/compile.yaml
+4
-4
.buildkite/test_areas/distributed.yaml
.buildkite/test_areas/distributed.yaml
+83
-21
.buildkite/test_areas/e2e_integration.yaml
.buildkite/test_areas/e2e_integration.yaml
+8
-7
.buildkite/test_areas/engine.yaml
.buildkite/test_areas/engine.yaml
+5
-1
.buildkite/test_areas/expert_parallelism.yaml
.buildkite/test_areas/expert_parallelism.yaml
+1
-1
.buildkite/test_areas/kernels.yaml
.buildkite/test_areas/kernels.yaml
+56
-5
.buildkite/test_areas/lm_eval.yaml
.buildkite/test_areas/lm_eval.yaml
+34
-5
.buildkite/test_areas/lora.yaml
.buildkite/test_areas/lora.yaml
+1
-1
.buildkite/test_areas/misc.yaml
.buildkite/test_areas/misc.yaml
+5
-5
No files found.
.buildkite/ci_config.yaml
View file @
82e40fb7
name
:
vllm_ci
name
:
vllm_ci
job_dirs
:
job_dirs
:
-
"
.buildkite/test_areas"
-
"
.buildkite/image_build"
-
"
.buildkite/image_build"
-
"
.buildkite/test_areas"
-
"
.buildkite/hardware_tests"
run_all_patterns
:
run_all_patterns
:
-
"
docker/Dockerfile"
-
"
docker/Dockerfile"
-
"
CMakeLists.txt"
-
"
CMakeLists.txt"
...
...
.buildkite/hardware_tests/amd.yaml
0 → 100644
View file @
82e40fb7
group
:
Hardware
steps
:
-
label
:
"
AMD:
:docker:
build
image"
device
:
amd_cpu
no_plugin
:
true
commands
:
-
>
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm
--target test
--no-cache
--progress plain .
-
docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
env
:
DOCKER_BUILDKIT
:
"
1"
retry
:
automatic
:
-
exit_status
:
-1
# Agent was lost
limit
:
1
-
exit_status
:
-10
# Agent was lost
limit
:
1
-
exit_status
:
1
# Machine occasionally fail
limit
:
1
.buildkite/hardware_tests/arm.yaml
0 → 100644
View file @
82e40fb7
group
:
Hardware
steps
:
-
label
:
"
Arm
CPU
Test"
soft_fail
:
true
device
:
arm_cpu
no_plugin
:
true
commands
:
-
bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
.buildkite/hardware_tests/ascend_npu.yaml
0 → 100644
View file @
82e40fb7
group
:
Hardware
depends_on
:
~
steps
:
-
label
:
"
Ascend
NPU
Test"
soft_fail
:
true
timeout_in_minutes
:
20
no_plugin
:
true
device
:
ascend_npu
commands
:
-
bash .buildkite/scripts/hardware_ci/run-npu-test.sh
.buildkite/hardware_tests/gh200.yaml
0 → 100644
View file @
82e40fb7
group
:
Hardware
steps
:
-
label
:
"
GH200
Test"
soft_fail
:
true
device
:
gh200
no_plugin
:
true
optional
:
true
commands
:
-
nvidia-smi
-
bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
.buildkite/hardware_tests/intel.yaml
0 → 100644
View file @
82e40fb7
group
:
Hardware
depends_on
:
~
steps
:
-
label
:
"
Intel
CPU
Test"
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
commands
:
-
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
-
label
:
"
Intel
HPU
Test"
soft_fail
:
true
device
:
intel_hpu
no_plugin
:
true
commands
:
-
bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
-
label
:
"
Intel
GPU
Test"
soft_fail
:
true
device
:
intel_gpu
no_plugin
:
true
commands
:
-
bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
.buildkite/image_build/image_build.sh
View file @
82e40fb7
#!/bin/bash
#!/bin/bash
set
-e
set
-e
uo
pipefail
if
[[
$#
-lt
8
]]
;
then
# replace invalid characters in Docker image tags and truncate to 128 chars
clean_docker_tag
()
{
local
input
=
"
$1
"
echo
"
$input
"
|
sed
's/[^a-zA-Z0-9._-]/_/g'
|
cut
-c1-128
}
print_usage_and_exit
()
{
echo
"Usage:
$0
<registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
echo
"Usage:
$0
<registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
exit
1
exit
1
}
print_instance_info
()
{
echo
""
echo
"=== Debug: Instance Information ==="
# Get IMDSv2 token
if
TOKEN
=
$(
curl
-s
-X
PUT
"http://169.254.169.254/latest/api/token"
\
-H
"X-aws-ec2-metadata-token-ttl-seconds: 21600"
2>/dev/null
)
;
then
AMI_ID
=
$(
curl
-s
-H
"X-aws-ec2-metadata-token:
$TOKEN
"
\
http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null
||
echo
"unknown"
)
INSTANCE_TYPE
=
$(
curl
-s
-H
"X-aws-ec2-metadata-token:
$TOKEN
"
\
http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null
||
echo
"unknown"
)
INSTANCE_ID
=
$(
curl
-s
-H
"X-aws-ec2-metadata-token:
$TOKEN
"
\
http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null
||
echo
"unknown"
)
AZ
=
$(
curl
-s
-H
"X-aws-ec2-metadata-token:
$TOKEN
"
\
http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null
||
echo
"unknown"
)
echo
"AMI ID:
${
AMI_ID
}
"
echo
"Instance Type:
${
INSTANCE_TYPE
}
"
echo
"Instance ID:
${
INSTANCE_ID
}
"
echo
"AZ:
${
AZ
}
"
else
echo
"Not running on EC2 or IMDS not available"
fi
# Check for warm cache AMI (marker file baked into custom AMI)
if
[[
-f
/etc/vllm-ami-info
]]
;
then
echo
"Cache: warm (custom vLLM AMI)"
cat
/etc/vllm-ami-info
else
echo
"Cache: cold (standard AMI)"
fi
echo
"==================================="
echo
""
}
setup_buildx_builder
()
{
echo
"--- :buildkite: Setting up buildx builder"
if
[[
-S
"
${
BUILDKIT_SOCKET
}
"
]]
;
then
# Custom AMI with standalone buildkitd - use remote driver for warm cache
echo
"✅ Found local buildkitd socket at
${
BUILDKIT_SOCKET
}
"
echo
"Using remote driver to connect to buildkitd (warm cache available)"
if
docker buildx inspect baked-vllm-builder
>
/dev/null 2>&1
;
then
echo
"Using existing baked-vllm-builder"
docker buildx use baked-vllm-builder
else
echo
"Creating baked-vllm-builder with remote driver"
docker buildx create
\
--name
baked-vllm-builder
\
--driver
remote
\
--use
\
"unix://
${
BUILDKIT_SOCKET
}
"
fi
docker buildx inspect
--bootstrap
elif
docker buildx inspect
"
${
BUILDER_NAME
}
"
>
/dev/null 2>&1
;
then
# Existing builder available
echo
"Using existing builder:
${
BUILDER_NAME
}
"
docker buildx use
"
${
BUILDER_NAME
}
"
docker buildx inspect
--bootstrap
else
# No local buildkitd, no existing builder - create new docker-container builder
echo
"No local buildkitd found, using docker-container driver"
docker buildx create
--name
"
${
BUILDER_NAME
}
"
--driver
docker-container
--use
docker buildx inspect
--bootstrap
fi
# builder info
echo
"Active builder:"
docker buildx
ls
|
grep
-E
'^\*|^NAME'
||
docker buildx
ls
}
check_and_skip_if_image_exists
()
{
if
[[
-n
"
${
IMAGE_TAG
:-}
"
]]
;
then
echo
"--- :mag: Checking if image exists"
if
docker manifest inspect
"
${
IMAGE_TAG
}
"
>
/dev/null 2>&1
;
then
echo
"Image already exists:
${
IMAGE_TAG
}
"
echo
"Skipping build"
exit
0
fi
echo
"Image not found, proceeding with build"
fi
}
ecr_login
()
{
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
"
$REGISTRY
"
aws ecr get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
936637512419.dkr.ecr.us-east-1.amazonaws.com
}
prepare_cache_tags
()
{
# resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
TEST_CACHE_ECR
=
"936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
MAIN_CACHE_ECR
=
"936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
if
[[
"
$BUILDKITE_PULL_REQUEST
"
==
"false"
]]
;
then
if
[[
"
$BUILDKITE_BRANCH
"
==
"main"
]]
;
then
cache
=
"
${
MAIN_CACHE_ECR
}
:latest"
else
clean_branch
=
$(
clean_docker_tag
"
$BUILDKITE_BRANCH
"
)
cache
=
"
${
TEST_CACHE_ECR
}
:
${
clean_branch
}
"
fi
CACHE_TO
=
"
$cache
"
CACHE_FROM
=
"
$cache
"
CACHE_FROM_BASE_BRANCH
=
"
$cache
"
else
CACHE_TO
=
"
${
TEST_CACHE_ECR
}
:pr-
${
BUILDKITE_PULL_REQUEST
}
"
CACHE_FROM
=
"
${
TEST_CACHE_ECR
}
:pr-
${
BUILDKITE_PULL_REQUEST
}
"
if
[[
"
$BUILDKITE_PULL_REQUEST_BASE_BRANCH
"
==
"main"
]]
;
then
CACHE_FROM_BASE_BRANCH
=
"
${
MAIN_CACHE_ECR
}
:latest"
else
clean_base
=
$(
clean_docker_tag
"
$BUILDKITE_PULL_REQUEST_BASE_BRANCH
"
)
CACHE_FROM_BASE_BRANCH
=
"
${
TEST_CACHE_ECR
}
:
${
clean_base
}
"
fi
fi
CACHE_FROM_MAIN
=
"
${
MAIN_CACHE_ECR
}
:latest"
export
CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
}
resolve_parent_commit
()
{
if
[[
-z
"
${
PARENT_COMMIT
:-}
"
]]
;
then
PARENT_COMMIT
=
$(
git rev-parse HEAD~1 2>/dev/null
||
echo
""
)
if
[[
-n
"
${
PARENT_COMMIT
}
"
]]
;
then
echo
"Computed parent commit for cache fallback:
${
PARENT_COMMIT
}
"
export
PARENT_COMMIT
else
echo
"Could not determine parent commit (may be first commit in repo)"
fi
else
echo
"Using provided PARENT_COMMIT:
${
PARENT_COMMIT
}
"
fi
}
print_bake_config
()
{
echo
"--- :page_facing_up: Resolved bake configuration"
BAKE_CONFIG_FILE
=
"bake-config-build-
${
BUILDKITE_BUILD_NUMBER
:-
local
}
.json"
docker buildx bake
-f
"
${
VLLM_BAKE_FILE
}
"
-f
"
${
CI_HCL_PATH
}
"
--print
"
${
TARGET
}
"
|
tee
"
${
BAKE_CONFIG_FILE
}
"
||
true
echo
"Saved bake config to
${
BAKE_CONFIG_FILE
}
"
echo
"--- :arrow_down: Uploading bake config to Buildkite"
buildkite-agent artifact upload
"
${
BAKE_CONFIG_FILE
}
"
}
#################################
# Main Script #
#################################
print_instance_info
if
[[
$#
-lt
7
]]
;
then
print_usage_and_exit
fi
fi
# input args
REGISTRY
=
$1
REGISTRY
=
$1
REPO
=
$2
REPO
=
$2
BUILDKITE_COMMIT
=
$3
BUILDKITE_COMMIT
=
$3
BRANCH
=
$4
BRANCH
=
$4
VLLM_USE_PRECOMPILED
=
$5
VLLM_USE_PRECOMPILED
=
$5
VLLM_MERGE_BASE_COMMIT
=
$6
VLLM_MERGE_BASE_COMMIT
=
$6
CACHE_FROM
=
$7
IMAGE_TAG
=
$7
CACHE_TO
=
$8
IMAGE_TAG_LATEST
=
${
8
:-}
# only used for main branch, optional
# authenticate with AWS ECR
# build config
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
$REGISTRY
TARGET
=
"test-ci"
aws ecr get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
936637512419.dkr.ecr.us-east-1.amazonaws.com
CI_HCL_URL
=
"
${
CI_HCL_URL
:-
https
://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl
}
"
VLLM_BAKE_FILE
=
"
${
VLLM_BAKE_FILE
:-
docker
/docker-bake.hcl
}
"
# docker buildx
BUILDER_NAME
=
"
${
BUILDER_NAME
:-
vllm
-builder
}
"
docker buildx create
--name
vllm-builder
--driver
docker-container
--use
CI_HCL_PATH
=
"/tmp/ci.hcl"
docker buildx inspect
--bootstrap
BUILDKIT_SOCKET
=
"/run/buildkit/buildkitd.sock"
docker buildx
ls
prepare_cache_tags
# skip build if image already exists
ecr_login
if
[[
-z
$(
docker manifest inspect
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
)
]]
;
then
echo
"Image not found, proceeding with build..."
# Environment info (for docs and human readers)
else
# CI_HCL_URL - URL to ci.hcl (default: from ci-infra main branch)
echo
"Image found"
# VLLM_CI_BRANCH - ci-infra branch to use (default: main)
exit
0
# VLLM_BAKE_FILE - Path to vLLM's bake file (default: docker/docker-bake.hcl)
fi
# BUILDER_NAME - Name for buildx builder (default: vllm-builder)
#
# Build configuration (exported as environment variables for bake):
export
BUILDKITE_COMMIT
export
PARENT_COMMIT
export
IMAGE_TAG
export
IMAGE_TAG_LATEST
export
CACHE_FROM
export
CACHE_FROM_BASE_BRANCH
export
CACHE_FROM_MAIN
export
CACHE_TO
export
VLLM_USE_PRECOMPILED
export
VLLM_MERGE_BASE_COMMIT
# print args
echo
"--- :mag: Arguments"
echo
"REGISTRY:
${
REGISTRY
}
"
echo
"REPO:
${
REPO
}
"
echo
"BUILDKITE_COMMIT:
${
BUILDKITE_COMMIT
}
"
echo
"BRANCH:
${
BRANCH
}
"
echo
"VLLM_USE_PRECOMPILED:
${
VLLM_USE_PRECOMPILED
}
"
echo
"VLLM_MERGE_BASE_COMMIT:
${
VLLM_MERGE_BASE_COMMIT
}
"
echo
"IMAGE_TAG:
${
IMAGE_TAG
}
"
echo
"IMAGE_TAG_LATEST:
${
IMAGE_TAG_LATEST
}
"
# print build configuration
echo
"--- :mag: Build configuration"
echo
"TARGET:
${
TARGET
}
"
echo
"CI HCL URL:
${
CI_HCL_URL
}
"
echo
"vLLM bake file:
${
VLLM_BAKE_FILE
}
"
echo
"BUILDER_NAME:
${
BUILDER_NAME
}
"
echo
"CI_HCL_PATH:
${
CI_HCL_PATH
}
"
echo
"BUILDKIT_SOCKET:
${
BUILDKIT_SOCKET
}
"
echo
"--- :mag: Cache tags"
echo
"CACHE_TO:
${
CACHE_TO
}
"
echo
"CACHE_FROM:
${
CACHE_FROM
}
"
echo
"CACHE_FROM_BASE_BRANCH:
${
CACHE_FROM_BASE_BRANCH
}
"
echo
"CACHE_FROM_MAIN:
${
CACHE_FROM_MAIN
}
"
check_and_skip_if_image_exists
if
[[
"
${
VLLM_USE_PRECOMPILED
:-
0
}
"
==
"1"
]]
;
then
echo
"--- :docker: Setting up Docker buildx bake"
merge_base_commit_build_args
=
"--build-arg VLLM_MERGE_BASE_COMMIT=
${
VLLM_MERGE_BASE_COMMIT
}
"
echo
"Target:
${
TARGET
}
"
else
echo
"CI HCL URL:
${
CI_HCL_URL
}
"
merge_base_commit_build_args
=
""
echo
"vLLM bake file:
${
VLLM_BAKE_FILE
}
"
if
[[
!
-f
"
${
VLLM_BAKE_FILE
}
"
]]
;
then
echo
"Error: vLLM bake file not found at
${
VLLM_BAKE_FILE
}
"
echo
"Make sure you're running from the vLLM repository root"
exit
1
fi
fi
# build
echo
"--- :arrow_down: Downloading ci.hcl"
docker buildx build
--file
docker/Dockerfile
\
curl
-sSfL
-o
"
${
CI_HCL_PATH
}
"
"
${
CI_HCL_URL
}
"
--build-arg
max_jobs
=
16
\
echo
"Downloaded to
${
CI_HCL_PATH
}
"
--build-arg
buildkite_commit
=
$BUILDKITE_COMMIT
\
--build-arg
USE_SCCACHE
=
1
\
setup_buildx_builder
--build-arg
TORCH_CUDA_ARCH_LIST
=
"8.0 8.9 9.0 10.0"
\
--build-arg
FI_TORCH_CUDA_ARCH_LIST
=
"8.0 8.9 9.0a 10.0a"
\
# Compute parent commit for cache fallback (if not already set)
--build-arg
VLLM_USE_PRECOMPILED
=
"
${
VLLM_USE_PRECOMPILED
:-
0
}
"
\
resolve_parent_commit
${
merge_base_commit_build_args
}
\
export
PARENT_COMMIT
--cache-from
type
=
registry,ref
=
${
CACHE_FROM
}
,mode
=
max
\
--cache-to
type
=
registry,ref
=
${
CACHE_TO
}
,mode
=
max
\
print_bake_config
--tag
${
REGISTRY
}
/
${
REPO
}
:
${
BUILDKITE_COMMIT
}
\
$(
[[
"
${
BRANCH
}
"
==
"main"
]]
&&
echo
"--tag
${
REGISTRY
}
/
${
REPO
}
:latest"
)
\
echo
"--- :docker: Building
${
TARGET
}
"
--push
\
docker
--debug
buildx bake
-f
"
${
VLLM_BAKE_FILE
}
"
-f
"
${
CI_HCL_PATH
}
"
--progress
plain
"
${
TARGET
}
"
--target
test
\
--progress
plain
.
echo
"--- :white_check_mark: Build complete"
.buildkite/image_build/image_build.yaml
View file @
82e40fb7
...
@@ -4,7 +4,8 @@ steps:
...
@@ -4,7 +4,8 @@ steps:
key
:
image-build
key
:
image-build
depends_on
:
[]
depends_on
:
[]
commands
:
commands
:
-
.buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
-
if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
-
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG_LATEST; fi
retry
:
retry
:
automatic
:
automatic
:
-
exit_status
:
-1
# Agent was lost
-
exit_status
:
-1
# Agent was lost
...
...
.buildkite/test-amd.yaml
View file @
82e40fb7
...
@@ -1131,7 +1131,7 @@ steps:
...
@@ -1131,7 +1131,7 @@ steps:
-
csrc/quantization/cutlass_w8a8/moe/
-
csrc/quantization/cutlass_w8a8/moe/
-
vllm/model_executor/layers/fused_moe/cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/flashinfer_
cutlass
_prepare_finalize.py
-
vllm/model_executor/layers/fused_moe/flashinfer_
a2a
_prepare_finalize.py
-
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/v1/attention/backends/mla/cutlass_mla.py
-
vllm/v1/attention/backends/mla/cutlass_mla.py
...
...
.buildkite/test-pipeline.yaml
View file @
82e40fb7
...
@@ -1017,7 +1017,7 @@ steps:
...
@@ -1017,7 +1017,7 @@ steps:
-
csrc/quantization/cutlass_w8a8/moe/
-
csrc/quantization/cutlass_w8a8/moe/
-
vllm/model_executor/layers/fused_moe/cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/flashinfer_
cutlass
_prepare_finalize.py
-
vllm/model_executor/layers/fused_moe/flashinfer_
a2a
_prepare_finalize.py
-
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/v1/attention/backends/mla/cutlass_mla.py
-
vllm/v1/attention/backends/mla/cutlass_mla.py
...
@@ -1316,7 +1316,7 @@ steps:
...
@@ -1316,7 +1316,7 @@ steps:
-
pytest -v -s distributed/test_distributed_oot.py
-
pytest -v -s distributed/test_distributed_oot.py
-
pytest -v -s entrypoints/openai/test_oot_registration.py
# it needs a clean process
-
pytest -v -s entrypoints/openai/test_oot_registration.py
# it needs a clean process
-
pytest -v -s models/test_oot_registration.py
# it needs a clean process
-
pytest -v -s models/test_oot_registration.py
# it needs a clean process
-
pytest -v -s plugins/lora_resolvers
# unit tests for
in-tree
lora resolver plugins
-
pytest -v -s plugins/lora_resolvers
# unit tests for lora resolver plugins
-
label
:
Pipeline + Context Parallelism Test
# 45min
-
label
:
Pipeline + Context Parallelism Test
# 45min
timeout_in_minutes
:
60
timeout_in_minutes
:
60
...
...
.buildkite/test_areas/attention.yaml
View file @
82e40fb7
...
@@ -4,7 +4,7 @@ depends_on:
...
@@ -4,7 +4,7 @@ depends_on:
steps
:
steps
:
-
label
:
V1 attention (H100)
-
label
:
V1 attention (H100)
timeout_in_minutes
:
30
timeout_in_minutes
:
30
gpu
:
h100
device
:
h100
source_file_dependencies
:
source_file_dependencies
:
-
vllm/config/attention.py
-
vllm/config/attention.py
-
vllm/model_executor/layers/attention
-
vllm/model_executor/layers/attention
...
@@ -15,7 +15,7 @@ steps:
...
@@ -15,7 +15,7 @@ steps:
-
label
:
V1 attention (B200)
-
label
:
V1 attention (B200)
timeout_in_minutes
:
30
timeout_in_minutes
:
30
gpu
:
b200
device
:
b200
source_file_dependencies
:
source_file_dependencies
:
-
vllm/config/attention.py
-
vllm/config/attention.py
-
vllm/model_executor/layers/attention
-
vllm/model_executor/layers/attention
...
...
.buildkite/test_areas/compile.yaml
View file @
82e40fb7
...
@@ -5,7 +5,7 @@ steps:
...
@@ -5,7 +5,7 @@ steps:
-
label
:
Fusion and Compile Tests (B200)
-
label
:
Fusion and Compile Tests (B200)
timeout_in_minutes
:
40
timeout_in_minutes
:
40
working_dir
:
"
/vllm-workspace/"
working_dir
:
"
/vllm-workspace/"
gpu
:
b200
device
:
b200
source_file_dependencies
:
source_file_dependencies
:
-
csrc/quantization/fp4/
-
csrc/quantization/fp4/
-
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
...
@@ -26,7 +26,7 @@ steps:
...
@@ -26,7 +26,7 @@ steps:
-
nvidia-smi
-
nvidia-smi
-
pytest -v -s tests/compile/test_fusion_attn.py
-
pytest -v -s tests/compile/test_fusion_attn.py
-
pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
-
pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_
gpu
s=2 is not set
# this runner has 2 GPUs available even though num_
device
s=2 is not set
-
pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-
pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
# Wrap with quotes to escape yaml
...
@@ -37,9 +37,9 @@ steps:
...
@@ -37,9 +37,9 @@ steps:
-
label
:
Fusion E2E (2 GPUs)(B200)
-
label
:
Fusion E2E (2 GPUs)(B200)
timeout_in_minutes
:
40
timeout_in_minutes
:
40
working_dir
:
"
/vllm-workspace/"
working_dir
:
"
/vllm-workspace/"
gpu
:
b200
device
:
b200
optional
:
true
optional
:
true
num_
gpu
s
:
2
num_
device
s
:
2
source_file_dependencies
:
source_file_dependencies
:
-
csrc/quantization/fp4/
-
csrc/quantization/fp4/
-
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
...
...
.buildkite/test_areas/distributed.yaml
View file @
82e40fb7
...
@@ -5,7 +5,7 @@ steps:
...
@@ -5,7 +5,7 @@ steps:
-
label
:
Distributed Comm Ops
-
label
:
Distributed Comm Ops
timeout_in_minutes
:
20
timeout_in_minutes
:
20
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
num_
gpu
s
:
2
num_
device
s
:
2
source_file_dependencies
:
source_file_dependencies
:
-
vllm/distributed
-
vllm/distributed
-
tests/distributed
-
tests/distributed
...
@@ -18,7 +18,7 @@ steps:
...
@@ -18,7 +18,7 @@ steps:
-
label
:
Distributed (2 GPUs)
-
label
:
Distributed (2 GPUs)
timeout_in_minutes
:
90
timeout_in_minutes
:
90
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
num_
gpu
s
:
2
num_
device
s
:
2
source_file_dependencies
:
source_file_dependencies
:
-
vllm/compilation/
-
vllm/compilation/
-
vllm/distributed/
-
vllm/distributed/
...
@@ -54,7 +54,7 @@ steps:
...
@@ -54,7 +54,7 @@ steps:
-
label
:
Distributed Tests (4 GPUs)
-
label
:
Distributed Tests (4 GPUs)
timeout_in_minutes
:
50
timeout_in_minutes
:
50
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
num_
gpu
s
:
4
num_
device
s
:
4
source_file_dependencies
:
source_file_dependencies
:
-
vllm/distributed/
-
vllm/distributed/
-
tests/distributed/test_utils
-
tests/distributed/test_utils
...
@@ -103,8 +103,8 @@ steps:
...
@@ -103,8 +103,8 @@ steps:
-
label
:
Distributed Tests (8 GPUs)(H100)
-
label
:
Distributed Tests (8 GPUs)(H100)
timeout_in_minutes
:
10
timeout_in_minutes
:
10
gpu
:
h100
device
:
h100
num_
gpu
s
:
8
num_
device
s
:
8
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
source_file_dependencies
:
source_file_dependencies
:
-
examples/offline_inference/torchrun_dp_example.py
-
examples/offline_inference/torchrun_dp_example.py
...
@@ -120,9 +120,9 @@ steps:
...
@@ -120,9 +120,9 @@ steps:
-
torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
-
torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
-
label
:
Distributed Tests (4 GPUs)(A100)
-
label
:
Distributed Tests (4 GPUs)(A100)
gpu
:
a100
device
:
a100
optional
:
true
optional
:
true
num_
gpu
s
:
4
num_
device
s
:
4
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
commands
:
commands
:
...
@@ -133,26 +133,34 @@ steps:
...
@@ -133,26 +133,34 @@ steps:
-
TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-
TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-
pytest -v -s -x lora/test_mixtral.py
-
pytest -v -s -x lora/test_mixtral.py
-
label
:
Distributed Tests (2 GPUs)(H200)
-
label
:
Sequence Parallel Tests (H100)
gpu
:
h200
timeout_in_minutes
:
60
working_dir
:
"
/vllm-workspace/"
device
:
h100
optional
:
true
num_devices
:
2
commands
:
-
export VLLM_TEST_CLEAN_GPU_MEMORY=1
# Run sequence parallel tests
-
pytest -v -s tests/distributed/test_sequence_parallel.py
-
pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-
label
:
Distributed Tests (2 GPUs)(H100)
device
:
h100
optional
:
true
optional
:
true
working_dir
:
"
/vllm-workspace/"
working_dir
:
"
/vllm-workspace/"
num_
gpu
s
:
2
num_
device
s
:
2
commands
:
commands
:
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-
pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-
pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
-
pytest -v -s tests/distributed/test_context_parallel.py
-
pytest -v -s tests/distributed/test_context_parallel.py
-
CUDA_VISIBLE_DEVICES=1,2
VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
-
VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
-
pytest -v -s tests/v1/distributed/test_dbo.py
-
pytest -v -s tests/v1/distributed/test_dbo.py
-
label
:
Distributed Tests (2 GPUs)(B200)
-
label
:
Distributed Tests (2 GPUs)(B200)
gpu
:
b200
device
:
b200
optional
:
true
optional
:
true
working_dir
:
"
/vllm-workspace/"
working_dir
:
"
/vllm-workspace/"
num_
gpu
s
:
2
num_
device
s
:
2
commands
:
commands
:
-
pytest -v -s tests/distributed/test_context_parallel.py
-
pytest -v -s tests/distributed/test_context_parallel.py
-
pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
-
pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
...
@@ -161,8 +169,9 @@ steps:
...
@@ -161,8 +169,9 @@ steps:
-
label
:
2 Node Test (4 GPUs)
-
label
:
2 Node Test (4 GPUs)
timeout_in_minutes
:
30
timeout_in_minutes
:
30
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
num_
gpu
s
:
2
num_
device
s
:
2
num_nodes
:
2
num_nodes
:
2
no_plugin
:
true
source_file_dependencies
:
source_file_dependencies
:
-
vllm/distributed/
-
vllm/distributed/
-
vllm/engine/
-
vllm/engine/
...
@@ -176,7 +185,7 @@ steps:
...
@@ -176,7 +185,7 @@ steps:
-
label
:
Distributed NixlConnector PD accuracy (4 GPUs)
-
label
:
Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes
:
30
timeout_in_minutes
:
30
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
num_
gpu
s
:
4
num_
device
s
:
4
source_file_dependencies
:
source_file_dependencies
:
-
vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-
vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-
tests/v1/kv_connector/nixl_integration/
-
tests/v1/kv_connector/nixl_integration/
...
@@ -184,10 +193,21 @@ steps:
...
@@ -184,10 +193,21 @@ steps:
-
uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-
uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-
bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
label
:
DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
timeout_in_minutes
:
30
working_dir
:
"
/vllm-workspace/tests"
num_devices
:
4
source_file_dependencies
:
-
vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-
tests/v1/kv_connector/nixl_integration/
commands
:
-
uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-
DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
label
:
Pipeline + Context Parallelism (4 GPUs))
-
label
:
Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes
:
60
timeout_in_minutes
:
60
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
num_
gpu
s
:
4
num_
device
s
:
4
source_file_dependencies
:
source_file_dependencies
:
-
vllm/distributed/
-
vllm/distributed/
-
vllm/engine/
-
vllm/engine/
...
@@ -197,3 +217,45 @@ steps:
...
@@ -197,3 +217,45 @@ steps:
commands
:
commands
:
-
pytest -v -s distributed/test_pp_cudagraph.py
-
pytest -v -s distributed/test_pp_cudagraph.py
-
pytest -v -s distributed/test_pipeline_parallel.py
-
pytest -v -s distributed/test_pipeline_parallel.py
-
label
:
Hopper Fusion E2E Tests (H100)
timeout_in_minutes
:
70
working_dir
:
"
/vllm-workspace/"
device
:
h100
optional
:
true
source_file_dependencies
:
-
csrc/quantization/fp4/
-
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/compilation/
# can affect pattern matching
-
vllm/model_executor/layers/layernorm.py
-
vllm/model_executor/layers/activation.py
-
vllm/model_executor/layers/quantization/input_quant_fp8.py
-
tests/compile/test_fusion_attn.py
commands
:
-
export VLLM_TEST_CLEAN_GPU_MEMORY=1
# skip Llama-4 since it does not fit on this device
-
pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
-
label
:
Hopper Fusion Distributed E2E Tests (2xH100)
timeout_in_minutes
:
70
working_dir
:
"
/vllm-workspace/"
device
:
h100
optional
:
true
num_devices
:
2
source_file_dependencies
:
-
csrc/quantization/fp4/
-
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/compilation/
# can affect pattern matching
-
vllm/model_executor/layers/layernorm.py
-
vllm/model_executor/layers/activation.py
-
vllm/model_executor/layers/quantization/input_quant_fp8.py
-
tests/compile/distributed/test_fusions_e2e.py
commands
:
-
export VLLM_TEST_CLEAN_GPU_MEMORY=1
# Run all e2e fusion tests
-
pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-
pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
.buildkite/test_areas/e2e_integration.yaml
View file @
82e40fb7
...
@@ -4,27 +4,27 @@ depends_on:
...
@@ -4,27 +4,27 @@ depends_on:
steps
:
steps
:
-
label
:
DeepSeek V2-Lite Accuracy
-
label
:
DeepSeek V2-Lite Accuracy
timeout_in_minutes
:
60
timeout_in_minutes
:
60
gpu
:
h100
device
:
h100
optional
:
true
optional
:
true
num_
gpu
s
:
4
num_
device
s
:
4
working_dir
:
"
/vllm-workspace"
working_dir
:
"
/vllm-workspace"
commands
:
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200
8010
-
bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200
8010
-
label
:
Qwen3-30B-A3B-FP8-block Accuracy
-
label
:
Qwen3-30B-A3B-FP8-block Accuracy
timeout_in_minutes
:
60
timeout_in_minutes
:
60
gpu
:
h100
device
:
h100
optional
:
true
optional
:
true
num_
gpu
s
:
4
num_
device
s
:
4
working_dir
:
"
/vllm-workspace"
working_dir
:
"
/vllm-workspace"
commands
:
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200
8020
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200
8020
-
label
:
Qwen3-30B-A3B-FP8-block Accuracy (B200)
-
label
:
Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes
:
60
timeout_in_minutes
:
60
gpu
:
b200
device
:
b200
optional
:
true
optional
:
true
num_
gpu
s
:
2
num_
device
s
:
2
working_dir
:
"
/vllm-workspace"
working_dir
:
"
/vllm-workspace"
commands
:
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2
1
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2
1
...
@@ -33,10 +33,11 @@ steps:
...
@@ -33,10 +33,11 @@ steps:
timeout_in_minutes
:
30
timeout_in_minutes
:
30
optional
:
true
optional
:
true
soft_fail
:
true
soft_fail
:
true
num_
gpu
s
:
2
num_
device
s
:
2
working_dir
:
"
/vllm-workspace"
working_dir
:
"
/vllm-workspace"
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
-
.buildkite/scripts/run-prime-rl-test.sh
-
.buildkite/scripts/run-prime-rl-test.sh
commands
:
commands
:
-
nvidia-smi
-
bash .buildkite/scripts/run-prime-rl-test.sh
-
bash .buildkite/scripts/run-prime-rl-test.sh
.buildkite/test_areas/engine.yaml
View file @
82e40fb7
...
@@ -23,4 +23,8 @@ steps:
...
@@ -23,4 +23,8 @@ steps:
# TODO: accuracy does not match, whether setting
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-
pytest -v -s v1/e2e
-
pytest -v -s v1/e2e
-
pytest -v -s v1/engine
# Run this test standalone for now;
# need to untangle use (implicit) use of spawn/fork across the tests.
-
pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
-
pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
.buildkite/test_areas/expert_parallelism.yaml
View file @
82e40fb7
...
@@ -14,7 +14,7 @@ steps:
...
@@ -14,7 +14,7 @@ steps:
-
label
:
EPLB Execution
-
label
:
EPLB Execution
timeout_in_minutes
:
20
timeout_in_minutes
:
20
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
num_
gpu
s
:
4
num_
device
s
:
4
source_file_dependencies
:
source_file_dependencies
:
-
vllm/distributed/eplb
-
vllm/distributed/eplb
-
tests/distributed/test_eplb_execute.py
-
tests/distributed/test_eplb_execute.py
...
...
.buildkite/test_areas/kernels.yaml
View file @
82e40fb7
...
@@ -57,8 +57,8 @@ steps:
...
@@ -57,8 +57,8 @@ steps:
-
label
:
Kernels DeepGEMM Test (H100)
-
label
:
Kernels DeepGEMM Test (H100)
timeout_in_minutes
:
45
timeout_in_minutes
:
45
gpu
:
h100
device
:
h100
num_
gpu
s
:
1
num_
device
s
:
1
source_file_dependencies
:
source_file_dependencies
:
-
tools/install_deepgemm.sh
-
tools/install_deepgemm.sh
-
vllm/utils/deep_gemm.py
-
vllm/utils/deep_gemm.py
...
@@ -77,7 +77,7 @@ steps:
...
@@ -77,7 +77,7 @@ steps:
-
label
:
Kernels (B200)
-
label
:
Kernels (B200)
timeout_in_minutes
:
30
timeout_in_minutes
:
30
working_dir
:
"
/vllm-workspace/"
working_dir
:
"
/vllm-workspace/"
gpu
:
b200
device
:
b200
# optional: true
# optional: true
source_file_dependencies
:
source_file_dependencies
:
-
csrc/quantization/fp4/
-
csrc/quantization/fp4/
...
@@ -85,7 +85,7 @@ steps:
...
@@ -85,7 +85,7 @@ steps:
-
csrc/quantization/cutlass_w8a8/moe/
-
csrc/quantization/cutlass_w8a8/moe/
-
vllm/model_executor/layers/fused_moe/cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/flashinfer_
cutlass
_prepare_finalize.py
-
vllm/model_executor/layers/fused_moe/flashinfer_
a2a
_prepare_finalize.py
-
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/v1/attention/backends/mla/cutlass_mla.py
-
vllm/v1/attention/backends/mla/cutlass_mla.py
...
@@ -115,3 +115,54 @@ steps:
...
@@ -115,3 +115,54 @@ steps:
-
pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
-
pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
-
pytest -v -s tests/kernels/moe/test_flashinfer.py
-
pytest -v -s tests/kernels/moe/test_flashinfer.py
-
pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
-
pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
# e2e
-
pytest -v -s tests/models/quantization/test_nvfp4.py
-
label
:
Kernels Helion Test
timeout_in_minutes
:
30
device
:
h100
source_file_dependencies
:
-
vllm/utils/import_utils.py
-
tests/kernels/helion/
commands
:
-
pip install helion
-
pytest -v -s kernels/helion/
-
label
:
Kernels FP8 MoE Test (1 H100)
timeout_in_minutes
:
90
device
:
h100
num_devices
:
1
optional
:
true
commands
:
-
pytest -v -s kernels/moe/test_cutlass_moe.py
-
pytest -v -s kernels/moe/test_flashinfer.py
-
pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
-
pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
-
pytest -v -s kernels/moe/test_moe.py
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
-
pytest -v -s kernels/moe/test_block_int8.py
-
pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
-
pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
-
label
:
Kernels FP8 MoE Test (2 H100s)
timeout_in_minutes
:
90
device
:
h100
num_devices
:
2
optional
:
true
commands
:
-
pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
-
pytest -v -s kernels/moe/test_deepep_moe.py
-
pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
-
label
:
Kernels Fp4 MoE Test (B200)
timeout_in_minutes
:
60
device
:
b200
num_devices
:
1
optional
:
true
commands
:
-
pytest -v -s kernels/moe/test_cutedsl_moe.py
-
pytest -v -s kernels/moe/test_flashinfer_moe.py
-
pytest -v -s kernels/moe/test_nvfp4_moe.py
-
pytest -v -s kernels/moe/test_ocp_mx_moe.py
.buildkite/test_areas/lm_eval.yaml
View file @
82e40fb7
...
@@ -12,9 +12,9 @@ steps:
...
@@ -12,9 +12,9 @@ steps:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
-
label
:
LM Eval Large Models (4 GPUs)(A100)
-
label
:
LM Eval Large Models (4 GPUs)(A100)
gpu
:
a100
device
:
a100
optional
:
true
optional
:
true
num_
gpu
s
:
4
num_
device
s
:
4
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
source_file_dependencies
:
-
csrc/
-
csrc/
...
@@ -24,9 +24,9 @@ steps:
...
@@ -24,9 +24,9 @@ steps:
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-
label
:
LM Eval Large Models (4 GPUs)(H100)
-
label
:
LM Eval Large Models (4 GPUs)(H100)
gpu
:
h100
device
:
h100
optional
:
true
optional
:
true
num_
gpu
s
:
4
num_
device
s
:
4
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
source_file_dependencies
:
-
csrc/
-
csrc/
...
@@ -37,10 +37,39 @@ steps:
...
@@ -37,10 +37,39 @@ steps:
-
label
:
LM Eval Small Models (B200)
-
label
:
LM Eval Small Models (B200)
timeout_in_minutes
:
120
timeout_in_minutes
:
120
gpu
:
b200
device
:
b200
optional
:
true
optional
:
true
source_file_dependencies
:
source_file_dependencies
:
-
csrc/
-
csrc/
-
vllm/model_executor/layers/quantization
-
vllm/model_executor/layers/quantization
commands
:
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
-
label
:
LM Eval Large Models (H200)
timeout_in_minutes
:
60
device
:
h200
optional
:
true
num_devices
:
8
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
-
label
:
MoE Refactor Integration Test (H100 - TEMPORARY)
device
:
h100
optional
:
true
num_devices
:
2
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
-
label
:
MoE Refactor Integration Test (B200 - TEMPORARY)
gpu
:
b200
optional
:
true
num_devices
:
2
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
-
label
:
MoE Refactor Integration Test (B200 DP - TEMPORARY)
device
:
b200
optional
:
true
num_devices
:
2
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
.buildkite/test_areas/lora.yaml
View file @
82e40fb7
...
@@ -14,7 +14,7 @@ steps:
...
@@ -14,7 +14,7 @@ steps:
-
label
:
LoRA TP (Distributed)
-
label
:
LoRA TP (Distributed)
timeout_in_minutes
:
30
timeout_in_minutes
:
30
num_
gpu
s
:
4
num_
device
s
:
4
source_file_dependencies
:
source_file_dependencies
:
-
vllm/lora
-
vllm/lora
-
tests/lora
-
tests/lora
...
...
.buildkite/test_areas/misc.yaml
View file @
82e40fb7
...
@@ -31,7 +31,7 @@ steps:
...
@@ -31,7 +31,7 @@ steps:
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
-
tests/v1
-
tests/v1
no_gpu
:
true
device
:
cpu
commands
:
commands
:
# split the test to avoid interference
# split the test to avoid interference
-
pytest -v -s -m 'cpu_test' v1/core
-
pytest -v -s -m 'cpu_test' v1/core
...
@@ -82,7 +82,7 @@ steps:
...
@@ -82,7 +82,7 @@ steps:
-
label
:
Metrics, Tracing (2 GPUs)
-
label
:
Metrics, Tracing (2 GPUs)
timeout_in_minutes
:
20
timeout_in_minutes
:
20
num_
gpu
s
:
2
num_
device
s
:
2
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
-
tests/v1/tracing
-
tests/v1/tracing
...
@@ -127,7 +127,7 @@ steps:
...
@@ -127,7 +127,7 @@ steps:
-
tests/tool_parsers
-
tests/tool_parsers
-
tests/transformers_utils
-
tests/transformers_utils
-
tests/config
-
tests/config
no_gpu
:
true
device
:
cpu
commands
:
commands
:
-
python3 standalone_tests/lazy_imports.py
-
python3 standalone_tests/lazy_imports.py
-
pytest -v -s test_inputs.py
-
pytest -v -s test_inputs.py
...
@@ -142,7 +142,7 @@ steps:
...
@@ -142,7 +142,7 @@ steps:
-
label
:
GPT-OSS Eval (B200)
-
label
:
GPT-OSS Eval (B200)
timeout_in_minutes
:
60
timeout_in_minutes
:
60
working_dir
:
"
/vllm-workspace/"
working_dir
:
"
/vllm-workspace/"
gpu
:
b200
device
:
b200
optional
:
true
optional
:
true
source_file_dependencies
:
source_file_dependencies
:
-
tests/evals/gpt_oss
-
tests/evals/gpt_oss
...
@@ -155,7 +155,7 @@ steps:
...
@@ -155,7 +155,7 @@ steps:
-
label
:
Batch Invariance (H100)
-
label
:
Batch Invariance (H100)
timeout_in_minutes
:
25
timeout_in_minutes
:
25
gpu
:
h100
device
:
h100
source_file_dependencies
:
source_file_dependencies
:
-
vllm/v1/attention
-
vllm/v1/attention
-
vllm/model_executor/layers
-
vllm/model_executor/layers
...
...
Prev
1
2
3
4
5
…
16
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment