Commit 82e40fb7 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.15.0rc1' into v0.15.0rc1-ori

parents 30a1922e 58996f35
name: vllm_ci name: vllm_ci
job_dirs: job_dirs:
- ".buildkite/test_areas"
- ".buildkite/image_build" - ".buildkite/image_build"
- ".buildkite/test_areas"
- ".buildkite/hardware_tests"
run_all_patterns: run_all_patterns:
- "docker/Dockerfile" - "docker/Dockerfile"
- "CMakeLists.txt" - "CMakeLists.txt"
......
group: Hardware
steps:
- label: "AMD: :docker: build image"
device: amd_cpu
no_plugin: true
commands:
- >
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm
--target test
--no-cache
--progress plain .
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- exit_status: 1 # Machine occasionally fail
limit: 1
group: Hardware
steps:
- label: "Arm CPU Test"
soft_fail: true
device: arm_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
group: Hardware
depends_on: ~
steps:
- label: "Ascend NPU Test"
soft_fail: true
timeout_in_minutes: 20
no_plugin: true
device: ascend_npu
commands:
- bash .buildkite/scripts/hardware_ci/run-npu-test.sh
group: Hardware
steps:
- label: "GH200 Test"
soft_fail: true
device: gh200
no_plugin: true
optional: true
commands:
- nvidia-smi
- bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
group: Hardware
depends_on: ~
steps:
- label: "Intel CPU Test"
soft_fail: true
device: intel_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
- label: "Intel HPU Test"
soft_fail: true
device: intel_hpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
- label: "Intel GPU Test"
soft_fail: true
device: intel_gpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
#!/bin/bash #!/bin/bash
set -e set -euo pipefail
if [[ $# -lt 8 ]]; then # replace invalid characters in Docker image tags and truncate to 128 chars
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>" clean_docker_tag() {
exit 1 local input="$1"
echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
}
print_usage_and_exit() {
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
exit 1
}
print_instance_info() {
echo ""
echo "=== Debug: Instance Information ==="
# Get IMDSv2 token
if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
-H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
echo "AMI ID: ${AMI_ID}"
echo "Instance Type: ${INSTANCE_TYPE}"
echo "Instance ID: ${INSTANCE_ID}"
echo "AZ: ${AZ}"
else
echo "Not running on EC2 or IMDS not available"
fi
# Check for warm cache AMI (marker file baked into custom AMI)
if [[ -f /etc/vllm-ami-info ]]; then
echo "Cache: warm (custom vLLM AMI)"
cat /etc/vllm-ami-info
else
echo "Cache: cold (standard AMI)"
fi
echo "==================================="
echo ""
}
setup_buildx_builder() {
echo "--- :buildkite: Setting up buildx builder"
if [[ -S "${BUILDKIT_SOCKET}" ]]; then
# Custom AMI with standalone buildkitd - use remote driver for warm cache
echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
echo "Using remote driver to connect to buildkitd (warm cache available)"
if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
echo "Using existing baked-vllm-builder"
docker buildx use baked-vllm-builder
else
echo "Creating baked-vllm-builder with remote driver"
docker buildx create \
--name baked-vllm-builder \
--driver remote \
--use \
"unix://${BUILDKIT_SOCKET}"
fi
docker buildx inspect --bootstrap
elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
# Existing builder available
echo "Using existing builder: ${BUILDER_NAME}"
docker buildx use "${BUILDER_NAME}"
docker buildx inspect --bootstrap
else
# No local buildkitd, no existing builder - create new docker-container builder
echo "No local buildkitd found, using docker-container driver"
docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
docker buildx inspect --bootstrap
fi
# builder info
echo "Active builder:"
docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
}
check_and_skip_if_image_exists() {
if [[ -n "${IMAGE_TAG:-}" ]]; then
echo "--- :mag: Checking if image exists"
if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
echo "Image already exists: ${IMAGE_TAG}"
echo "Skipping build"
exit 0
fi
echo "Image not found, proceeding with build"
fi
}
ecr_login() {
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
}
prepare_cache_tags() {
# resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
cache="${MAIN_CACHE_ECR}:latest"
else
clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
cache="${TEST_CACHE_ECR}:${clean_branch}"
fi
CACHE_TO="$cache"
CACHE_FROM="$cache"
CACHE_FROM_BASE_BRANCH="$cache"
else
CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
else
clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
fi
fi
CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
}
resolve_parent_commit() {
if [[ -z "${PARENT_COMMIT:-}" ]]; then
PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
if [[ -n "${PARENT_COMMIT}" ]]; then
echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
export PARENT_COMMIT
else
echo "Could not determine parent commit (may be first commit in repo)"
fi
else
echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
fi
}
print_bake_config() {
echo "--- :page_facing_up: Resolved bake configuration"
BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
docker buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
echo "Saved bake config to ${BAKE_CONFIG_FILE}"
echo "--- :arrow_down: Uploading bake config to Buildkite"
buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
}
#################################
# Main Script #
#################################
print_instance_info
if [[ $# -lt 7 ]]; then
print_usage_and_exit
fi fi
# input args
REGISTRY=$1 REGISTRY=$1
REPO=$2 REPO=$2
BUILDKITE_COMMIT=$3 BUILDKITE_COMMIT=$3
BRANCH=$4 BRANCH=$4
VLLM_USE_PRECOMPILED=$5 VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6 VLLM_MERGE_BASE_COMMIT=$6
CACHE_FROM=$7 IMAGE_TAG=$7
CACHE_TO=$8 IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
# docker buildx
docker buildx create --name vllm-builder --driver docker-container --use
docker buildx inspect --bootstrap
docker buildx ls
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then # build config
merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}" TARGET="test-ci"
else CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
merge_base_commit_build_args="" VLLM_BAKE_FILE="${VLLM_BAKE_FILE:-docker/docker-bake.hcl}"
BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
CI_HCL_PATH="/tmp/ci.hcl"
BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
prepare_cache_tags
ecr_login
# Environment info (for docs and human readers)
# CI_HCL_URL - URL to ci.hcl (default: from ci-infra main branch)
# VLLM_CI_BRANCH - ci-infra branch to use (default: main)
# VLLM_BAKE_FILE - Path to vLLM's bake file (default: docker/docker-bake.hcl)
# BUILDER_NAME - Name for buildx builder (default: vllm-builder)
#
# Build configuration (exported as environment variables for bake):
export BUILDKITE_COMMIT
export PARENT_COMMIT
export IMAGE_TAG
export IMAGE_TAG_LATEST
export CACHE_FROM
export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN
export CACHE_TO
export VLLM_USE_PRECOMPILED
export VLLM_MERGE_BASE_COMMIT
# print args
echo "--- :mag: Arguments"
echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}"
echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
echo "IMAGE_TAG: ${IMAGE_TAG}"
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
# print build configuration
echo "--- :mag: Build configuration"
echo "TARGET: ${TARGET}"
echo "CI HCL URL: ${CI_HCL_URL}"
echo "vLLM bake file: ${VLLM_BAKE_FILE}"
echo "BUILDER_NAME: ${BUILDER_NAME}"
echo "CI_HCL_PATH: ${CI_HCL_PATH}"
echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
echo "--- :mag: Cache tags"
echo "CACHE_TO: ${CACHE_TO}"
echo "CACHE_FROM: ${CACHE_FROM}"
echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
check_and_skip_if_image_exists
echo "--- :docker: Setting up Docker buildx bake"
echo "Target: ${TARGET}"
echo "CI HCL URL: ${CI_HCL_URL}"
echo "vLLM bake file: ${VLLM_BAKE_FILE}"
if [[ ! -f "${VLLM_BAKE_FILE}" ]]; then
echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE}"
echo "Make sure you're running from the vLLM repository root"
exit 1
fi fi
# build echo "--- :arrow_down: Downloading ci.hcl"
docker buildx build --file docker/Dockerfile \ curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
--build-arg max_jobs=16 \ echo "Downloaded to ${CI_HCL_PATH}"
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg USE_SCCACHE=1 \ setup_buildx_builder
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \ # Compute parent commit for cache fallback (if not already set)
--build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \ resolve_parent_commit
${merge_base_commit_build_args} \ export PARENT_COMMIT
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
--cache-to type=registry,ref=${CACHE_TO},mode=max \ print_bake_config
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \ echo "--- :docker: Building ${TARGET}"
--push \ docker --debug buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
--target test \
--progress plain . echo "--- :white_check_mark: Build complete"
...@@ -4,7 +4,8 @@ steps: ...@@ -4,7 +4,8 @@ steps:
key: image-build key: image-build
depends_on: [] depends_on: []
commands: commands:
- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO - if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG_LATEST; fi
retry: retry:
automatic: automatic:
- exit_status: -1 # Agent was lost - exit_status: -1 # Agent was lost
......
...@@ -1131,7 +1131,7 @@ steps: ...@@ -1131,7 +1131,7 @@ steps:
- csrc/quantization/cutlass_w8a8/moe/ - csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py - vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py - vllm/v1/attention/backends/mla/cutlass_mla.py
......
...@@ -1017,7 +1017,7 @@ steps: ...@@ -1017,7 +1017,7 @@ steps:
- csrc/quantization/cutlass_w8a8/moe/ - csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py - vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py - vllm/v1/attention/backends/mla/cutlass_mla.py
...@@ -1316,7 +1316,7 @@ steps: ...@@ -1316,7 +1316,7 @@ steps:
- pytest -v -s distributed/test_distributed_oot.py - pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins - pytest -v -s plugins/lora_resolvers # unit tests for lora resolver plugins
- label: Pipeline + Context Parallelism Test # 45min - label: Pipeline + Context Parallelism Test # 45min
timeout_in_minutes: 60 timeout_in_minutes: 60
......
...@@ -4,7 +4,7 @@ depends_on: ...@@ -4,7 +4,7 @@ depends_on:
steps: steps:
- label: V1 attention (H100) - label: V1 attention (H100)
timeout_in_minutes: 30 timeout_in_minutes: 30
gpu: h100 device: h100
source_file_dependencies: source_file_dependencies:
- vllm/config/attention.py - vllm/config/attention.py
- vllm/model_executor/layers/attention - vllm/model_executor/layers/attention
...@@ -15,7 +15,7 @@ steps: ...@@ -15,7 +15,7 @@ steps:
- label: V1 attention (B200) - label: V1 attention (B200)
timeout_in_minutes: 30 timeout_in_minutes: 30
gpu: b200 device: b200
source_file_dependencies: source_file_dependencies:
- vllm/config/attention.py - vllm/config/attention.py
- vllm/model_executor/layers/attention - vllm/model_executor/layers/attention
......
...@@ -5,7 +5,7 @@ steps: ...@@ -5,7 +5,7 @@ steps:
- label: Fusion and Compile Tests (B200) - label: Fusion and Compile Tests (B200)
timeout_in_minutes: 40 timeout_in_minutes: 40
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
gpu: b200 device: b200
source_file_dependencies: source_file_dependencies:
- csrc/quantization/fp4/ - csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
...@@ -26,7 +26,7 @@ steps: ...@@ -26,7 +26,7 @@ steps:
- nvidia-smi - nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set # this runner has 2 GPUs available even though num_devices=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml # Wrap with quotes to escape yaml
...@@ -37,9 +37,9 @@ steps: ...@@ -37,9 +37,9 @@ steps:
- label: Fusion E2E (2 GPUs)(B200) - label: Fusion E2E (2 GPUs)(B200)
timeout_in_minutes: 40 timeout_in_minutes: 40
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
gpu: b200 device: b200
optional: true optional: true
num_gpus: 2 num_devices: 2
source_file_dependencies: source_file_dependencies:
- csrc/quantization/fp4/ - csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
......
...@@ -5,7 +5,7 @@ steps: ...@@ -5,7 +5,7 @@ steps:
- label: Distributed Comm Ops - label: Distributed Comm Ops
timeout_in_minutes: 20 timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_devices: 2
source_file_dependencies: source_file_dependencies:
- vllm/distributed - vllm/distributed
- tests/distributed - tests/distributed
...@@ -18,7 +18,7 @@ steps: ...@@ -18,7 +18,7 @@ steps:
- label: Distributed (2 GPUs) - label: Distributed (2 GPUs)
timeout_in_minutes: 90 timeout_in_minutes: 90
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_devices: 2
source_file_dependencies: source_file_dependencies:
- vllm/compilation/ - vllm/compilation/
- vllm/distributed/ - vllm/distributed/
...@@ -54,7 +54,7 @@ steps: ...@@ -54,7 +54,7 @@ steps:
- label: Distributed Tests (4 GPUs) - label: Distributed Tests (4 GPUs)
timeout_in_minutes: 50 timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_devices: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- tests/distributed/test_utils - tests/distributed/test_utils
...@@ -103,8 +103,8 @@ steps: ...@@ -103,8 +103,8 @@ steps:
- label: Distributed Tests (8 GPUs)(H100) - label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10 timeout_in_minutes: 10
gpu: h100 device: h100
num_gpus: 8 num_devices: 8
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
source_file_dependencies: source_file_dependencies:
- examples/offline_inference/torchrun_dp_example.py - examples/offline_inference/torchrun_dp_example.py
...@@ -120,9 +120,9 @@ steps: ...@@ -120,9 +120,9 @@ steps:
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: Distributed Tests (4 GPUs)(A100) - label: Distributed Tests (4 GPUs)(A100)
gpu: a100 device: a100
optional: true optional: true
num_gpus: 4 num_devices: 4
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
commands: commands:
...@@ -133,26 +133,34 @@ steps: ...@@ -133,26 +133,34 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py - pytest -v -s -x lora/test_mixtral.py
- label: Distributed Tests (2 GPUs)(H200) - label: Sequence Parallel Tests (H100)
gpu: h200 timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
device: h100
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# Run sequence parallel tests
- pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- label: Distributed Tests (2 GPUs)(H100)
device: h100
optional: true optional: true
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_gpus: 2 num_devices: 2
commands: commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
- label: Distributed Tests (2 GPUs)(B200) - label: Distributed Tests (2 GPUs)(B200)
gpu: b200 device: b200
optional: true optional: true
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_gpus: 2 num_devices: 2
commands: commands:
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
...@@ -161,8 +169,9 @@ steps: ...@@ -161,8 +169,9 @@ steps:
- label: 2 Node Test (4 GPUs) - label: 2 Node Test (4 GPUs)
timeout_in_minutes: 30 timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_devices: 2
num_nodes: 2 num_nodes: 2
no_plugin: true
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- vllm/engine/ - vllm/engine/
...@@ -176,7 +185,7 @@ steps: ...@@ -176,7 +185,7 @@ steps:
- label: Distributed NixlConnector PD accuracy (4 GPUs) - label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30 timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_devices: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/ - tests/v1/kv_connector/nixl_integration/
...@@ -184,10 +193,21 @@ steps: ...@@ -184,10 +193,21 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: Pipeline + Context Parallelism (4 GPUs)) - label: Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_devices: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- vllm/engine/ - vllm/engine/
...@@ -196,4 +216,46 @@ steps: ...@@ -196,4 +216,46 @@ steps:
- tests/distributed/ - tests/distributed/
commands: commands:
- pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py - pytest -v -s distributed/test_pipeline_parallel.py
\ No newline at end of file
- label: Hopper Fusion E2E Tests (H100)
timeout_in_minutes: 70
working_dir: "/vllm-workspace/"
device: h100
optional: true
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusion_attn.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# skip Llama-4 since it does not fit on this device
- pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
- label: Hopper Fusion Distributed E2E Tests (2xH100)
timeout_in_minutes: 70
working_dir: "/vllm-workspace/"
device: h100
optional: true
num_devices: 2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
...@@ -4,27 +4,27 @@ depends_on: ...@@ -4,27 +4,27 @@ depends_on:
steps: steps:
- label: DeepSeek V2-Lite Accuracy - label: DeepSeek V2-Lite Accuracy
timeout_in_minutes: 60 timeout_in_minutes: 60
gpu: h100 device: h100
optional: true optional: true
num_gpus: 4 num_devices: 4
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy - label: Qwen3-30B-A3B-FP8-block Accuracy
timeout_in_minutes: 60 timeout_in_minutes: 60
gpu: h100 device: h100
optional: true optional: true
num_gpus: 4 num_devices: 4
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200) - label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60 timeout_in_minutes: 60
gpu: b200 device: b200
optional: true optional: true
num_gpus: 2 num_devices: 2
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
...@@ -33,10 +33,11 @@ steps: ...@@ -33,10 +33,11 @@ steps:
timeout_in_minutes: 30 timeout_in_minutes: 30
optional: true optional: true
soft_fail: true soft_fail: true
num_gpus: 2 num_devices: 2
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- .buildkite/scripts/run-prime-rl-test.sh - .buildkite/scripts/run-prime-rl-test.sh
commands: commands:
- nvidia-smi
- bash .buildkite/scripts/run-prime-rl-test.sh - bash .buildkite/scripts/run-prime-rl-test.sh
...@@ -23,4 +23,8 @@ steps: ...@@ -23,4 +23,8 @@ steps:
# TODO: accuracy does not match, whether setting # TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100. # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e - pytest -v -s v1/e2e
- pytest -v -s v1/engine # Run this test standalone for now;
# need to untangle use (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
...@@ -14,7 +14,7 @@ steps: ...@@ -14,7 +14,7 @@ steps:
- label: EPLB Execution - label: EPLB Execution
timeout_in_minutes: 20 timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_devices: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/eplb - vllm/distributed/eplb
- tests/distributed/test_eplb_execute.py - tests/distributed/test_eplb_execute.py
......
...@@ -57,8 +57,8 @@ steps: ...@@ -57,8 +57,8 @@ steps:
- label: Kernels DeepGEMM Test (H100) - label: Kernels DeepGEMM Test (H100)
timeout_in_minutes: 45 timeout_in_minutes: 45
gpu: h100 device: h100
num_gpus: 1 num_devices: 1
source_file_dependencies: source_file_dependencies:
- tools/install_deepgemm.sh - tools/install_deepgemm.sh
- vllm/utils/deep_gemm.py - vllm/utils/deep_gemm.py
...@@ -77,7 +77,7 @@ steps: ...@@ -77,7 +77,7 @@ steps:
- label: Kernels (B200) - label: Kernels (B200)
timeout_in_minutes: 30 timeout_in_minutes: 30
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
gpu: b200 device: b200
# optional: true # optional: true
source_file_dependencies: source_file_dependencies:
- csrc/quantization/fp4/ - csrc/quantization/fp4/
...@@ -85,7 +85,7 @@ steps: ...@@ -85,7 +85,7 @@ steps:
- csrc/quantization/cutlass_w8a8/moe/ - csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py - vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py - vllm/v1/attention/backends/mla/cutlass_mla.py
...@@ -114,4 +114,55 @@ steps: ...@@ -114,4 +114,55 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
\ No newline at end of file # e2e
- pytest -v -s tests/models/quantization/test_nvfp4.py
- label: Kernels Helion Test
timeout_in_minutes: 30
device: h100
source_file_dependencies:
- vllm/utils/import_utils.py
- tests/kernels/helion/
commands:
- pip install helion
- pytest -v -s kernels/helion/
- label: Kernels FP8 MoE Test (1 H100)
timeout_in_minutes: 90
device: h100
num_devices: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutlass_moe.py
- pytest -v -s kernels/moe/test_flashinfer.py
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
- pytest -v -s kernels/moe/test_moe.py
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
- pytest -v -s kernels/moe/test_block_int8.py
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
- label: Kernels FP8 MoE Test (2 H100s)
timeout_in_minutes: 90
device: h100
num_devices: 2
optional: true
commands:
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
- pytest -v -s kernels/moe/test_deepep_moe.py
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
- label: Kernels Fp4 MoE Test (B200)
timeout_in_minutes: 60
device: b200
num_devices: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutedsl_moe.py
- pytest -v -s kernels/moe/test_flashinfer_moe.py
- pytest -v -s kernels/moe/test_nvfp4_moe.py
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
...@@ -12,9 +12,9 @@ steps: ...@@ -12,9 +12,9 @@ steps:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: LM Eval Large Models (4 GPUs)(A100) - label: LM Eval Large Models (4 GPUs)(A100)
gpu: a100 device: a100
optional: true optional: true
num_gpus: 4 num_devices: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
...@@ -24,9 +24,9 @@ steps: ...@@ -24,9 +24,9 @@ steps:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: LM Eval Large Models (4 GPUs)(H100) - label: LM Eval Large Models (4 GPUs)(H100)
gpu: h100 device: h100
optional: true optional: true
num_gpus: 4 num_devices: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
...@@ -37,10 +37,39 @@ steps: ...@@ -37,10 +37,39 @@ steps:
- label: LM Eval Small Models (B200) - label: LM Eval Small Models (B200)
timeout_in_minutes: 120 timeout_in_minutes: 120
gpu: b200 device: b200
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
- label: LM Eval Large Models (H200)
timeout_in_minutes: 60
device: h200
optional: true
num_devices: 8
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
- label: MoE Refactor Integration Test (H100 - TEMPORARY)
device: h100
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
- label: MoE Refactor Integration Test (B200 - TEMPORARY)
gpu: b200
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
device: b200
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
...@@ -14,7 +14,7 @@ steps: ...@@ -14,7 +14,7 @@ steps:
- label: LoRA TP (Distributed) - label: LoRA TP (Distributed)
timeout_in_minutes: 30 timeout_in_minutes: 30
num_gpus: 4 num_devices: 4
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
- tests/lora - tests/lora
......
...@@ -31,7 +31,7 @@ steps: ...@@ -31,7 +31,7 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/v1 - tests/v1
no_gpu: true device: cpu
commands: commands:
# split the test to avoid interference # split the test to avoid interference
- pytest -v -s -m 'cpu_test' v1/core - pytest -v -s -m 'cpu_test' v1/core
...@@ -82,7 +82,7 @@ steps: ...@@ -82,7 +82,7 @@ steps:
- label: Metrics, Tracing (2 GPUs) - label: Metrics, Tracing (2 GPUs)
timeout_in_minutes: 20 timeout_in_minutes: 20
num_gpus: 2 num_devices: 2
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/v1/tracing - tests/v1/tracing
...@@ -127,7 +127,7 @@ steps: ...@@ -127,7 +127,7 @@ steps:
- tests/tool_parsers - tests/tool_parsers
- tests/transformers_utils - tests/transformers_utils
- tests/config - tests/config
no_gpu: true device: cpu
commands: commands:
- python3 standalone_tests/lazy_imports.py - python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
...@@ -142,7 +142,7 @@ steps: ...@@ -142,7 +142,7 @@ steps:
- label: GPT-OSS Eval (B200) - label: GPT-OSS Eval (B200)
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
gpu: b200 device: b200
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- tests/evals/gpt_oss - tests/evals/gpt_oss
...@@ -155,7 +155,7 @@ steps: ...@@ -155,7 +155,7 @@ steps:
- label: Batch Invariance (H100) - label: Batch Invariance (H100)
timeout_in_minutes: 25 timeout_in_minutes: 25
gpu: h100 device: h100
source_file_dependencies: source_file_dependencies:
- vllm/v1/attention - vllm/v1/attention
- vllm/model_executor/layers - vllm/model_executor/layers
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment