Commit 006693ed authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.11.2' into v0.11.2-ori

parents 4b51e6f1 275de341
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
# This script will be fed into the Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.
# Documentation
# label(str): the name of the test. emojis allowed.
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
# fast_check_only(bool): run this test on the fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for the test. incompatible with command.
# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
# in this case, commands must be specified. the first command runs on the first host, the second
# command runs on the second host.
# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
# When adding a test
# - If the test belongs to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
steps:
##### fast check tests #####
# Runs standalone_tests/pytorch_nightly_dependency.sh when the nightly torch
# requirements file changes. soft_fail lets this step fail without failing the
# whole pipeline.
- label: Pytorch Nightly Dependency Override Check # 2min
  # A failure here means the nightly torch version is not compatible with some
  # of the dependencies. Check the error message and add the package to the
  # whitelist in /vllm/tools/pre_commit/generate_nightly_torch_test.py
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
  commands:
  - bash standalone_tests/pytorch_nightly_dependency.sh
# Multimodal tests that are not marked cpu_test, followed by the utils_ suite.
- label: Async Engine, Inputs, Utils, Worker Test # 10min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 15
  source_file_dependencies:
  - vllm/
  - tests/multimodal
  - tests/utils_
  commands:
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_
# no_gpu counterpart: lazy-import check, input/output tests, the multimodal
# tests marked cpu_test, and the transformers_utils suite.
- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 10
  no_gpu: true
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
  - tests/standalone_tests/lazy_imports.py
  - tests/transformers_utils
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s transformers_utils
# Runs standalone_tests/python_only_compile.sh when that script or setup.py changes.
- label: Python-only Installation Test # 10min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 20
  source_file_dependencies:
  - tests/standalone_tests/python_only_compile.sh
  - setup.py
  commands:
  - bash standalone_tests/python_only_compile.sh
# Basic correctness suites (cumem, basic correctness, CPU offload), run with
# the spawn multiprocessing method exported first.
- label: Basic Correctness Test # 20min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 30
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_basic_correctness
  - tests/basic_correctness/test_cpu_offload
  - tests/basic_correctness/test_cumem.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
# Tool-parser tests plus the remaining entrypoints tests; the llm, openai,
# offline_mode, pooling and test_chat_utils.py paths are ignored here.
- label: Entrypoints Unit Tests # 5min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 10
  working_dir: '/vllm-workspace/tests'
  fast_check: true
  source_file_dependencies:
  - vllm/entrypoints
  - tests/entrypoints/
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
# entrypoints/llm and entrypoints/offline_mode tests; test_generate.py and
# offline_mode each get their own pytest invocation.
- label: Entrypoints Integration Test (LLM) # 30min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 40
  working_dir: '/vllm-workspace/tests'
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
# entrypoints/openai tests (collective RPC first, then the rest minus the
# ignored paths) and entrypoints/test_chat_utils.py.
- label: Entrypoints Integration Test (API Server) # 100min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 130
  working_dir: '/vllm-workspace/tests'
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
  - pytest -v -s entrypoints/test_chat_utils.py
# entrypoints/pooling tests, run with the spawn multiprocessing method.
- label: Entrypoints Integration Test (Pooling)
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 50
  working_dir: '/vllm-workspace/tests'
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/pooling
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/pooling
# 4-GPU distributed coverage: torchrun examples (dense and MoE) under several
# TP/PP/DP/EP layouts, data-parallel tests, assorted distributed unit tests,
# and the RLHF offline-inference examples.
- label: Distributed Tests (4 GPUs) # 35min
  agent_pool: mi325_4
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 50
  num_gpus: 4
  working_dir: '/vllm-workspace/tests'
  source_file_dependencies:
  - vllm/distributed/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
  # The torchrun example scripts are executed below, so changes to them must
  # trigger this step.
  - tests/distributed/test_torchrun_example.py
  - tests/distributed/test_torchrun_example_moe.py
  - tests/compile/test_basic_correctness
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  # The script is run as ../examples/offline_inference/data_parallel.py from
  # the tests working_dir, i.e. it lives under examples/, not tests/examples/.
  - examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
  # test with torchrun tp=2 and external_dp=2
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=2 and pp=2
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=4 and dp=1
  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=2, pp=2 and dp=1
  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=1 and dp=4 with ep
  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=2 and dp=2 with ep
  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd
# Single 8-process torchrun run of the data-parallel example (tp=2, dp=4,
# expert parallelism enabled).
- label: Distributed Tests (8 GPUs) # 4min
  agent_pool: mi325_8
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 10
  gpu: h100
  num_gpus: 8
  working_dir: '/vllm-workspace/tests'
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
  - vllm/config/parallel.py
  - vllm/distributed/
  - vllm/v1/engine/llm_engine.py
  - vllm/v1/executor/uniproc_executor.py
  - vllm/v1/worker/gpu_worker.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  #- export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and dp=4 with ep
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
# Runs distributed/test_eplb_algo.py.
- label: EPLB Algorithm Test # 5min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 15
  working_dir: '/vllm-workspace/tests'
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_algo.py
  commands:
  - pytest -v -s distributed/test_eplb_algo.py
# Runs the EPLB execution and EPLB spec-decode tests on 4 GPUs.
- label: EPLB Execution Test # 10min
  agent_pool: mi325_4
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 20
  num_gpus: 4
  working_dir: '/vllm-workspace/tests'
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_execute.py
  # Listed because this step also runs the spec-decode test below; without it
  # a change to that test file alone would not trigger the step.
  - tests/distributed/test_eplb_spec_decode.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
  - pytest -v -s distributed/test_eplb_spec_decode.py
# Installs the OpenTelemetry packages and then runs the v1/tracing tests.
- label: Metrics, Tracing Test # 12min
  agent_pool: mi325_2
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 20
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/v1/tracing
  commands:
  # Folded scalar: the lines are joined with single spaces into one command.
  - >-
    pip install
    'opentelemetry-sdk>=1.26.0'
    'opentelemetry-api>=1.26.0'
    'opentelemetry-exporter-otlp>=1.26.0'
    'opentelemetry-semantic-conventions-ai>=0.4.1'
  - pytest -v -s v1/tracing
##### fast check tests #####
##### 1 GPU test #####
# Installs modelscope and runs test_regression.py.
- label: Regression Test # 7min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  - amdtentative
  grade: Blocking
  timeout_in_minutes: 20
  working_dir: '/vllm-workspace/tests' # optional
  source_file_dependencies:
  - vllm/
  - tests/test_regression
  commands:
  - pip install modelscope
  - pytest -v -s test_regression.py
# Engine, sequence, config, logger and port tests in one pytest run;
# tokenization runs in a separate invocation.
- label: Engine Test # 25min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 40
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/tokenization
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization
# Runs the v1/e2e and v1/engine test directories.
- label: V1 Test e2e + engine # 30min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  # TODO: accuracy does not match, whether setting
  # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
  - pytest -v -s v1/e2e
  - pytest -v -s v1/engine
# Runs the v1/entrypoints test directory.
- label: V1 Test entrypoints # 35min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 50
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  - pytest -v -s v1/entrypoints
# Remaining v1 suites, each in its own pytest invocation, followed by an
# lm-eval based streaming-correctness integration test.
- label: V1 Test others # 42min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  # split the test to avoid interference
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
  - pytest -v -s -m 'not cpu_test' v1/core
  - pytest -v -s v1/executor
  - pytest -v -s v1/kv_offload
  - pytest -v -s v1/sample
  - pytest -v -s v1/logits_processors
  - pytest -v -s v1/worker
  - pytest -v -s v1/spec_decode
  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'not cpu_test' v1/metrics
  - pytest -v -s v1/test_oracle.py
  - pytest -v -s v1/test_request.py
  - pytest -v -s v1/test_outputs.py
  # Integration test for streaming correctness (requires special branch).
  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
# TODO: Add the "V1 Test attention (MI300)" test group
# Runs the v1/attention test directory.
- label: V1 Test attention (H100) # 10min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 30
  gpu: h100
  source_file_dependencies:
  - vllm/v1/attention
  - tests/v1/attention
  commands:
  - pytest -v -s v1/attention
# no_gpu v1 suites: the cpu_test-marked core, kv_connector and metrics tests,
# plus structured_output and test_serial_utils.py.
- label: V1 Test others (CPU) # 5 mins
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  no_gpu: true
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  # split the test to avoid interference
  - pytest -v -s -m 'cpu_test' v1/core
  - pytest -v -s v1/structured_output
  - pytest -v -s v1/test_serial_utils.py
  - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'cpu_test' v1/metrics
# Executes the example scripts from /vllm-workspace/examples one after another.
- label: Examples Test # 30min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 45
  working_dir: '/vllm-workspace/examples'
  source_file_dependencies:
  - vllm/entrypoints
  - examples/
  commands:
  - pip install tensorizer # for tensorizer test
  - python3 offline_inference/basic/generate.py --model facebook/opt-125m
  - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
  - python3 offline_inference/basic/chat.py
  - python3 offline_inference/prefix_caching.py
  - python3 offline_inference/llm_engine_example.py
  - python3 offline_inference/audio_language.py --seed 0
  - python3 offline_inference/vision_language.py --seed 0
  - python3 offline_inference/vision_language_pooling.py --seed 0
  - python3 offline_inference/vision_language_multi_image.py --seed 0
  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
  - python3 offline_inference/basic/classify.py
  - python3 offline_inference/basic/embed.py
  - python3 offline_inference/basic/score.py
  - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
  # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
  #- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# Runs cuda/test_cuda_context.py.
- label: Platform Tests (CUDA) # 4min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 15
  source_file_dependencies:
  - vllm/
  - tests/cuda
  commands:
  - pytest -v -s cuda/test_cuda_context.py
# Runs the samplers suite twice: once as-is and once with
# VLLM_USE_FLASHINFER_SAMPLER=1.
- label: Samplers Test # 56min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 75
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
  - tests/samplers
  - tests/conftest.py
  commands:
  - pytest -v -s samplers
  - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
# LoRA tests sharded across 4 parallel jobs via $$BUILDKITE_PARALLEL_JOB and
# $$BUILDKITE_PARALLEL_JOB_COUNT; the ignored files are excluded from this step.
- label: LoRA Test %N # 20min each
  agent_pool: mi325_8
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 30
  parallelism: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
  # Folded block scalar: YAML joins the lines below with single spaces into
  # one command. Shell-style trailing backslashes must NOT be used here: YAML
  # does not treat them as continuations and would keep them in the command.
  - >-
    pytest -v -s lora
    --shard-id=$$BUILDKITE_PARALLEL_JOB
    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    --ignore=lora/test_chatglm3_tp.py
    --ignore=lora/test_llama_tp.py
    --ignore=lora/test_llm_with_multi_loras.py
    --ignore=lora/test_olmoe_tp.py
    --ignore=lora/test_deepseekv2_tp.py
    --ignore=lora/test_gptoss_tp.py
    --ignore=lora/test_qwen3moe_tp.py
# Individual compile/ unit-test files, one pytest invocation per file.
- label: PyTorch Compilation Unit Tests # 15min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 30
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_pass_manager.py
  - pytest -v -s compile/test_fusion.py
  - pytest -v -s compile/test_fusion_attn.py
  - pytest -v -s compile/test_functionalization.py
  - pytest -v -s compile/test_silu_mul_quant_fusion.py
  # - pytest -v -s compile/test_sequence_parallelism.py
  # - pytest -v -s compile/test_async_tp.py
  - pytest -v -s compile/test_fusion_all_reduce.py
  - pytest -v -s compile/test_decorator.py
  - pytest -v -s compile/test_noop_elimination.py
  - pytest -v -s compile/test_aot_compile.py
# Compile basic-correctness, multimodal-compile and piecewise tests.
- label: PyTorch Fullgraph Smoke Test # 15min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 30
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s compile/test_multimodal_compile.py
  - pytest -v -s compile/piecewise/
# Full-graph compile tests (minus test_fp8_kv_scale_compile) and a filtered
# subset of the fusion end-to-end tests.
- label: PyTorch Fullgraph Test # 27min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 40
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
  # Limit to no custom ops to reduce running time
  # Wrap with quotes to escape yaml and avoid starting -k string with a -
  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
# Runs the v1/cudagraph dispatch and mode tests.
- label: Cudagraph test
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  timeout_in_minutes: 20
  source_file_dependencies:
  - tests/v1/cudagraph
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/config/compilation.py
  - vllm/compilation
  commands:
  - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
  - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
# Runs kernels/core together with kernels/test_top_k_per_row.py.
- label: Kernels Core Operation Test # 48min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 75
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  - tests/kernels/test_top_k_per_row.py
  commands:
  - pytest -v -s kernels/core kernels/test_top_k_per_row.py
# kernels/attention tests, sharded across 2 parallel jobs.
- label: Kernels Attention Test %N # 23min
  agent_pool: mi325_8
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 35
  parallelism: 2
  source_file_dependencies:
  - csrc/attention/
  - vllm/attention
  - vllm/v1/attention
  - tests/kernels/attention
  commands:
  - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
# kernels/quantization tests, sharded across 2 parallel jobs.
- label: Kernels Quantization Test %N # 64min
  agent_pool: mi325_8
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 90
  parallelism: 2
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization
  commands:
  - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
# kernels/moe tests, sharded across 2 parallel jobs.
- label: Kernels MoE Test %N # 40min
  agent_pool: mi325_8
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 60
  parallelism: 2
  source_file_dependencies:
  - csrc/quantization/cutlass_w8a8/moe/
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
  - vllm/envs.py
  - vllm/config
  commands:
  - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
# Runs the kernels/mamba test directory.
- label: Kernels Mamba Test # 31min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 45
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
  - vllm/model_executor/layers/mamba/ops
  commands:
  - pytest -v -s kernels/mamba
# Installs curl and libsodium23, then runs the model_executor suite and the
# tensorizer entrypoint test.
- label: Model Executor Test # 23min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 35
  torch_nightly: true
  source_file_dependencies:
  - vllm/engine/arg_utils.py
  - vllm/config/model.py
  - vllm/model_executor
  - tests/model_executor
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
  - apt-get update && apt-get install -y curl libsodium23
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s model_executor
  - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
# Runs scripts/run-benchmarks.sh from /vllm-workspace/.buildkite.
- label: Benchmarks # 11min
  agent_pool: mi325_8
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 20
  working_dir: '/vllm-workspace/.buildkite'
  source_file_dependencies:
  - benchmarks/
  commands:
  - bash scripts/run-benchmarks.sh
# Runs the benchmarks/ test directory.
- label: Benchmarks CLI Test # 7min
  agent_pool: mi325_8
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 20
  source_file_dependencies:
  - vllm/
  - tests/benchmarks/
  commands:
  - pytest -v -s benchmarks/
# Installs torchao 0.13.0 and runs the quantization suite, skipping
# quantization/test_blackwell_moe.py.
- label: Quantization Test # 70min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 90
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
  commands:
  # temporary install here since we need nightly, will move to requirements/test.in
  # after torchao 0.12 release, and pin a working version of torchao nightly here
  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
  - uv pip install --system torchao==0.13.0
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
# GSM8K correctness test over the models listed in configs/models-small.txt, tp-size 1.
- label: LM Eval Small Models # 15min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 20
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
# Runs entrypoints/openai/correctness/, excluding the transcription API test.
- label: OpenAI API correctness # 10min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 15
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
  - vllm/model_executor/models/whisper.py
  commands: # LMEval
  # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
  - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py
# tool_use tests that are not marked cpu_test.
- label: OpenAI-Compatible Tool Use # 23 min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 35
  fast_check: false
  source_file_dependencies:
  - vllm/
  - tests/tool_use
  commands:
  - pytest -v -s -m 'not cpu_test' tool_use
# no_gpu counterpart: tool_use tests marked cpu_test.
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 10
  no_gpu: true
  source_file_dependencies:
  - vllm/
  - tests/tool_use
  commands:
  - pytest -v -s -m 'cpu_test' tool_use
##### models test #####
# Runs only test_can_initialize_small_subset from models/test_initialization.py.
- label: Basic Models Tests (Initialization)
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 45
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/test_initialization.py
  commands:
  # Run a subset of model initialization tests
  - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
# Everything in models/test_initialization.py except
# test_can_initialize_small_subset, sharded across 2 parallel jobs.
- label: Basic Models Tests (Extra Initialization) %N
  agent_pool: mi325_8
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 45
  torch_nightly: true
  parallelism: 2
  source_file_dependencies:
  - vllm/model_executor/models/
  - tests/models/test_initialization.py
  commands:
  # Only when vLLM model source is modified - test initialization of a large
  # subset of supported models (the complement of the small subset in the above
  # test.) Also run if model initialization test file is modified
  #
  # Folded block scalar: YAML joins the lines below with single spaces into
  # one command. Shell-style trailing backslashes must NOT be used here: YAML
  # does not treat them as continuations and would keep them in the command.
  - >-
    pytest -v -s models/test_initialization.py
    -k 'not test_can_initialize_small_subset'
    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    --shard-id=$$BUILDKITE_PARALLEL_JOB
# Runs models/test_transformers.py and models/test_registry.py in one pytest invocation.
- label: Basic Models Tests (Other)
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 45
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
  - pytest -v -s models/test_transformers.py models/test_registry.py
# no_gpu step: models/test_utils.py and models/test_vision.py.
- label: Basic Models Test (Other CPU) # 5min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 10
  no_gpu: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/test_utils.py
  - tests/models/test_vision.py
  commands:
  - pytest -v -s models/test_utils.py models/test_vision.py
# models/language tests marked core_model and not slow_test.
- label: Language Models Tests (Standard)
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 25
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/language
  commands:
  # Test standard language models, excluding a subset of slow tests
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/language -m 'core_model and (not slow_test)'
# models/language tests marked both core_model and slow_test, sharded across
# 2 parallel jobs.
- label: Language Models Tests (Extra Standard) %N
  agent_pool: mi325_8
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 45
  torch_nightly: true
  parallelism: 2
  source_file_dependencies:
  - vllm/model_executor/models/
  - tests/models/language/pooling/test_embedding.py
  - tests/models/language/generation/test_common.py
  - tests/models/language/pooling/test_classification.py
  commands:
  # Shard slow subset of standard language models tests. Only run when model
  # source is modified, or when specified test files are modified
  - pip freeze | grep -E 'torch'
  # Folded block scalar: YAML joins the lines below with single spaces into
  # one command. Shell-style trailing backslashes must NOT be used here: YAML
  # does not treat them as continuations and would keep them in the command.
  - >-
    pytest -v -s models/language -m 'core_model and slow_test'
    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    --shard-id=$$BUILDKITE_PARALLEL_JOB
# models/language/generation tests marked hybrid_model, sharded across 2
# parallel jobs, after installing the mamba and causal-conv1d packages.
- label: Language Models Tests (Hybrid) %N
  agent_pool: mi325_8
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 75
  torch_nightly: true
  parallelism: 2
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
  # Install fast path packages for testing against transformers
  # Note: also needed to run plamo2 model in vLLM
  - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
  # Shard hybrid language model tests
  #
  # Folded block scalar: YAML joins the lines below with single spaces into
  # one command. Shell-style trailing backslashes must NOT be used here: YAML
  # does not treat them as continuations and would keep them in the command.
  - >-
    pytest -v -s models/language/generation
    -m hybrid_model
    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    --shard-id=$$BUILDKITE_PARALLEL_JOB
# Optional step: models/language/generation tests that are neither core_model
# nor hybrid_model.
- label: Language Models Test (Extended Generation) # 80min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 110
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
  # Install fast path packages for testing against transformers
  # Note: also needed to run plamo2 model in vLLM
  - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
  - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
# Optional step: runs models/language/generation_ppl_test.
- label: Language Models Test (PPL)
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 110
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation_ppl_test
  commands:
  - pytest -v -s models/language/generation_ppl_test
# Optional step: models/language/pooling tests not marked core_model.
- label: Language Models Test (Extended Pooling) # 36min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 50
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling
  commands:
  - pytest -v -s models/language/pooling -m 'not core_model'
# Optional step: runs models/language/pooling_mteb_test.
- label: Language Models Test (MTEB)
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 110
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling_mteb_test
  commands:
  - pytest -v -s models/language/pooling_mteb_test
# Installs Mantis from GitHub and runs models/multimodal/processing.
- label: Multi-Modal Processor Test # 44min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/processing
# core_model multimodal tests; the whisper generation test is run separately
# from the parent directory with the spawn multiprocessing method.
- label: Multi-Modal Models Test (Standard) # 60min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  timeout_in_minutes: 80
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
  - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
# Runs test_lm_eval_correctness.py from .buildkite/lm-eval-harness over the
# models listed in configs/models-mm-small.txt, tp-size 1.
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  - amdproduction
  # grade: Blocking
  timeout_in_minutes: 70
  working_dir: '/vllm-workspace/.buildkite/lm-eval-harness'
  source_file_dependencies:
  - vllm/multimodal/
  - vllm/inputs/
  - vllm/v1/core/
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
# Optional step: non-core_model multimodal tests, excluding
# generation/test_common.py and the processing directory.
- label: Multi-Modal Models Test (Extended) 1
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
# Optional step: split group 0 of the non-core_model tests in
# models/multimodal/generation/test_common.py.
- label: Multi-Modal Models Test (Extended) 2
  agent_pool: mi325_1
  mirror_hardwares:
  - amdexperimental
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
# Optional: split group 1 of the non-core tests in generation/test_common.py.
- label: Multi-Modal Models Test (Extended) 3
  mirror_hardwares:
    - amdexperimental
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  source_file_dependencies:
    - vllm/
    - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
# Runs the model tests under models/quantization.
- label: Quantized Models Test  # 45 min
  timeout_in_minutes: 60
  mirror_hardwares:
    - amdexperimental
    - amdproduction
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
    - vllm/model_executor/layers/quantization
    - tests/models/quantization
  commands:
    - pytest -v -s models/quantization
# Scratch step for the PR development phase: it exists so individual models
# can be tested on a branch, and it should never run on main.
- label: Custom Models Test
  mirror_hardwares:
    - amdexperimental
    - amdproduction
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  commands:
    - echo 'Testing custom models...'
    # PR authors can temporarily add commands below to test individual models
    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
    # *To avoid merge conflicts, remember to REMOVE (not just comment out)
    # them before merging the PR*
# Optional: upgrades transformers from its git repository, then runs model
# tests and offline-inference examples from the workspace root.
- label: Transformers Nightly Models Test
  mirror_hardwares:
    - amdexperimental
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  working_dir: '/vllm-workspace/'
  commands:
    - pip install --upgrade git+https://github.com/huggingface/transformers
    - pytest -v -s tests/models/test_initialization.py
    - pytest -v -s tests/models/test_transformers.py
    - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py
    - python3 examples/offline_inference/basic/chat.py
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
# Attention, quantization and MoE kernel tests on a b200 runner, run from the
# workspace root.
- label: Blackwell Test  # 21 min
  timeout_in_minutes: 30
  gpu: b200
  # optional: true
  working_dir: '/vllm-workspace/'
  source_file_dependencies:
    - csrc/quantization/fp4/
    - csrc/attention/mla/
    - csrc/quantization/cutlass_w8a8/moe/
    - vllm/model_executor/layers/fused_moe/cutlass_moe.py
    - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
    - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
    - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
    - vllm/v1/attention/backends/flashinfer.py
    - vllm/v1/attention/backends/mla/cutlass_mla.py
    - vllm/v1/attention/backends/mla/flashinfer_mla.py
    - vllm/platforms/cuda.py
    - vllm/attention/selector.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py
    # Attention
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
    - pytest -v -s tests/kernels/attention/test_attention_selector.py
    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
    - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
    # Quantization
    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
    - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
# Compile-time fusion tests on a b200 runner, plus one filtered e2e case.
- label: Blackwell Fusion Tests  # 30 min
  timeout_in_minutes: 40
  gpu: b200
  working_dir: '/vllm-workspace/'
  source_file_dependencies:
    - csrc/quantization/fp4/
    - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
    - vllm/v1/attention/backends/flashinfer.py
    - vllm/compilation/
    # can affect pattern matching
    - vllm/model_executor/layers/layernorm.py
    - vllm/model_executor/layers/activation.py
    - vllm/model_executor/layers/quantization/input_quant_fp8.py
  commands:
    - nvidia-smi
    - pytest -v -s tests/compile/test_fusion_attn.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_gpus=2 is not set
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion
    # to reduce running time.
    # Wrap with quotes to escape yaml
    - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
# Optional: the full e2e fusion test file plus one full-graph test, on two
# b200 GPUs.
- label: Blackwell Fusion E2E Tests  # 30 min
  timeout_in_minutes: 40
  gpu: b200
  num_gpus: 2
  optional: true
  working_dir: '/vllm-workspace/'
  source_file_dependencies:
    - csrc/quantization/fp4/
    - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
    - vllm/v1/attention/backends/flashinfer.py
    - vllm/compilation/
    # can affect pattern matching
    - vllm/model_executor/layers/layernorm.py
    - vllm/model_executor/layers/activation.py
    - vllm/model_executor/layers/quantization/input_quant_fp8.py
    - tests/compile/test_fusions_e2e.py
    - tests/compile/test_full_graph.py
  commands:
    - nvidia-smi
    # Run all e2e fusion tests
    - pytest -v -s tests/compile/test_fusions_e2e.py
    # test_fp8_kv_scale_compile requires FlashAttention
    # (not supported on default L4/L40)
    - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
# Optional GPQA correctness run for openai/gpt-oss-20b, with the AITER
# environment switches set on the pytest command.
- label: ROCm GPT-OSS Eval
  timeout_in_minutes: 60
  mirror_hardwares:
    - amdproduction
  agent_pool: mi325_1
  # run on nightlies
  optional: true
  working_dir: '/vllm-workspace/'
  source_file_dependencies:
    - tests/evals/gpt_oss
    - vllm/model_executor/models/gpt_oss.py
    - vllm/model_executor/layers/quantization/mxfp4.py
    - vllm/v1/attention/backends/flashinfer.py
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
    - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
# Runs tests/quantization/test_blackwell_moe.py on a b200 runner.
- label: Blackwell Quantized MoE Test
  timeout_in_minutes: 60
  gpu: b200
  working_dir: '/vllm-workspace/'
  source_file_dependencies:
    - tests/quantization/test_blackwell_moe.py
    - vllm/model_executor/models/deepseek_v2.py
    - vllm/model_executor/models/gpt_oss.py
    - vllm/model_executor/models/llama4.py
    - vllm/model_executor/layers/fused_moe
    - vllm/model_executor/layers/quantization/compressed_tensors
    - vllm/model_executor/layers/quantization/modelopt.py
    - vllm/model_executor/layers/quantization/mxfp4.py
    - vllm/v1/attention/backends/flashinfer.py
  commands:
    - pytest -s -v tests/quantization/test_blackwell_moe.py
# Optional GSM8K correctness run over the configs in models-blackwell.txt.
- label: Blackwell LM Eval Small Models
  timeout_in_minutes: 120
  gpu: b200
  # run on nightlies
  optional: true
  source_file_dependencies:
    - csrc/
    - vllm/model_executor/layers/quantization
  commands:
    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
##### 1 GPU test #####
##### multi gpus test #####
# Communication-op and shared-memory tests from tests/distributed on 2 GPUs.
- label: Distributed Comm Ops Test  # 7min
  timeout_in_minutes: 20
  mirror_hardwares:
    - amdexperimental
    - amdproduction
  agent_pool: mi325_2
  # grade: Blocking
  num_gpus: 2
  working_dir: '/vllm-workspace/tests'
  source_file_dependencies:
    - vllm/distributed
    - tests/distributed
  commands:
    - pytest -v -s distributed/test_comm_ops.py
    - pytest -v -s distributed/test_shm_broadcast.py
    - pytest -v -s distributed/test_shm_buffer.py
    - pytest -v -s distributed/test_shm_storage.py
# Simulated two-node run (num_nodes: 2, two GPUs per node). `commands` holds
# one nested list per node: the first list runs on the first node and the
# second list on the second node.
- label: 2 Node Tests (4 GPUs in total)  # 16min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
  source_file_dependencies:
    - vllm/distributed/
    - vllm/engine/
    - vllm/executor/
    - vllm/model_executor/models/
    - tests/distributed/
    # The commands below run ../examples/offline_inference/data_parallel.py
    # from /vllm-workspace/tests, so the prefix is rooted at examples/
    # (it was previously listed under tests/examples/, which is not the
    # script the commands execute).
    - examples/offline_inference/data_parallel.py
  commands:
    - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
      - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
      - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
      - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
      - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
      - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
    - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
      - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
      - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
      - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
# Data-parallel, compile, same-node, sequence-parallel, shutdown and worker
# memory tests on a 2-GPU agent.
- label: Distributed Tests (2 GPUs)  # 68min
  timeout_in_minutes: 90
  mirror_hardwares:
    - amdexperimental
  agent_pool: mi325_2
  # grade: Blocking
  num_gpus: 2
  working_dir: '/vllm-workspace/tests'
  source_file_dependencies:
    - vllm/compilation/
    - vllm/distributed/
    - vllm/engine/
    - vllm/executor/
    - vllm/worker/worker_base.py
    - vllm/v1/engine/
    - vllm/v1/worker/
    - tests/compile/test_basic_correctness.py
    - tests/compile/test_wrapper.py
    - tests/distributed/
    - tests/entrypoints/llm/test_collective_rpc.py
    - tests/v1/distributed
    - tests/v1/entrypoints/openai/test_multi_api_servers.py
    - tests/v1/shutdown
    - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
    - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
    - pytest -v -s entrypoints/llm/test_collective_rpc.py
    - pytest -v -s ./compile/test_basic_correctness.py
    - pytest -v -s ./compile/test_wrapper.py
    # Each same-node run passes only if the expected line appears in its output.
    - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
    - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
    - pytest -v -s distributed/test_sequence_parallel.py
    - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
    - pytest -v -s v1/worker/test_worker_memory_snapshot.py
# Model tests selected with the `distributed(num_gpus=2)` marker, plus the
# sharded state loader test.
- label: Distributed Model Tests (2 GPUs)  # 37min
  timeout_in_minutes: 50
  mirror_hardwares:
    - amdexperimental
  agent_pool: mi325_2
  # grade: Blocking
  num_gpus: 2
  working_dir: '/vllm-workspace/tests'
  source_file_dependencies:
    - vllm/model_executor/model_loader/sharded_state_loader.py
    - vllm/model_executor/models/
    - tests/basic_correctness/
    - tests/model_executor/model_loader/test_sharded_state_loader.py
    - tests/models/
  commands:
    - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
    - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
    # Avoid importing model tests that cause CUDA reinitialization error
    - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
    - pytest models/language -v -s -m 'distributed(num_gpus=2)'
    - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
    - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
# Plugin tests. The first three plugins are each installed, tested and
# uninstalled in turn, so command order matters in this step.
- label: Plugin Tests (2 GPUs)  # 40min
  timeout_in_minutes: 60
  mirror_hardwares:
    - amdexperimental
  agent_pool: mi325_2
  # grade: Blocking
  num_gpus: 2
  working_dir: '/vllm-workspace/tests'
  source_file_dependencies:
    - vllm/plugins/
    - tests/plugins/
  commands:
    # begin platform plugin and general plugin tests, all the code in-between
    # runs on dummy platform
    - pip install -e ./plugins/vllm_add_dummy_platform
    - pytest -v -s plugins_tests/test_platform_plugins.py
    - pip uninstall vllm_add_dummy_platform -y
    # end platform plugin tests
    # begin io_processor plugins test, all the code in between uses the
    # prithvi_io_processor plugin
    - pip install -e ./plugins/prithvi_io_processor_plugin
    - pytest -v -s plugins_tests/test_io_processor_plugins.py
    - pip uninstall prithvi_io_processor_plugin -y
    # end io_processor plugins test
    # begin stat_logger plugins test
    - pip install -e ./plugins/vllm_add_dummy_stat_logger
    - pytest -v -s plugins_tests/test_stats_logger_plugins.py
    - pip uninstall dummy_stat_logger -y
    # end stat_logger plugins test
    # other tests continue here:
    - pytest -v -s plugins_tests/test_scheduler_plugins.py
    - pip install -e ./plugins/vllm_add_dummy_model
    - pytest -v -s distributed/test_distributed_oot.py
    # it needs a clean process
    - pytest -v -s entrypoints/openai/test_oot_registration.py
    # it needs a clean process
    - pytest -v -s models/test_oot_registration.py
    # unit tests for in-tree lora resolver plugins
    - pytest -v -s plugins/lora_resolvers
# Runs test_pp_cudagraph.py and test_pipeline_parallel.py on 4 GPUs.
- label: Pipeline + Context Parallelism Test  # 45min
  timeout_in_minutes: 60
  mirror_hardwares:
    - amdexperimental
    - amdproduction
  agent_pool: mi325_4
  # grade: Blocking
  num_gpus: 4
  working_dir: '/vllm-workspace/tests'
  source_file_dependencies:
    - vllm/distributed/
    - vllm/engine/
    - vllm/executor/
    - vllm/model_executor/models/
    - tests/distributed/
  commands:
    - pytest -v -s distributed/test_pp_cudagraph.py
    - pytest -v -s distributed/test_pipeline_parallel.py
# LoRA tests that exercise tensor parallelism, on 4 GPUs.
- label: LoRA TP Test (Distributed)  # 17 min
  timeout_in_minutes: 30
  mirror_hardwares:
    - amdexperimental
    - amdproduction
  agent_pool: mi325_4
  # grade: Blocking
  num_gpus: 4
  source_file_dependencies:
    - vllm/lora
    - tests/lora
  commands:
    # FIXIT: find out which code initialize cuda before running the test
    # before the fix, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # There is some Tensor Parallelism related processing logic in LoRA that
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
    - pytest -v -s -x lora/test_olmoe_tp.py
    - pytest -v -s -x lora/test_gptoss_tp.py
# Optional: runs the weight-loading script against weight_loading/models.txt.
- label: Weight Loading Multiple GPU Test  # 33min
  timeout_in_minutes: 45
  mirror_hardwares:
    - amdexperimental
  agent_pool: mi325_2
  # grade: Blocking
  num_gpus: 2
  optional: true
  working_dir: '/vllm-workspace/tests'
  source_file_dependencies:
    - vllm/
    - tests/weight_loading
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
# Optional: runs the weight-loading script against
# weight_loading/models-large.txt.
- label: Weight Loading Multiple GPU Test - Large Models  # optional
  mirror_hardwares:
    - amdexperimental
  agent_pool: mi325_2
  # grade: Blocking
  gpu: a100
  num_gpus: 2
  optional: true
  working_dir: '/vllm-workspace/tests'
  source_file_dependencies:
    - vllm/
    - tests/weight_loading
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
# Installs the kv_connectors requirements, then runs the Nixl integration
# accuracy sweep script on 4 GPUs.
- label: NixlConnector PD accuracy tests (Distributed)  # 30min
  timeout_in_minutes: 30
  mirror_hardwares:
    - amdexperimental
  agent_pool: mi325_4
  # grade: Blocking
  num_gpus: 4
  working_dir: '/vllm-workspace/tests'
  source_file_dependencies:
    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
    - tests/v1/kv_connector/nixl_integration/
  commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
##### multi gpus test #####
##### A100 test #####
# Optional 4-GPU step: custom all-reduce, CA buffer sharing, the A100
# basic-correctness suite and the Mixtral LoRA test.
- label: Distributed Tests (A100)  # optional
  mirror_hardwares:
    - amdexperimental
  agent_pool: mi325_4
  # grade: Blocking
  gpu: a100
  num_gpus: 4
  optional: true
  source_file_dependencies:
    - vllm/
  commands:
    # NOTE: don't test llama model here, it seems hf implementation is buggy
    # see https://github.com/vllm-project/vllm/pull/5689 for details
    - pytest -v -s distributed/test_custom_all_reduce.py
    - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
    - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
    - pytest -v -s -x lora/test_mixtral.py
# Optional lm-eval correctness run over configs/models-large.txt with
# --tp-size=4.
- label: LM Eval Large Models  # optional
  mirror_hardwares:
    - amdexperimental
    - amdproduction
  agent_pool: mi325_4
  # grade: Blocking
  gpu: a100
  num_gpus: 4
  optional: true
  working_dir: '/vllm-workspace/.buildkite/lm-eval-harness'
  source_file_dependencies:
    - csrc/
    - vllm/model_executor/layers/quantization
  commands:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
##### H100 test #####
# Optional lm-eval correctness run over configs/models-large-hopper.txt with
# --tp-size=4.
- label: LM Eval Large Models (H100)  # optional
  mirror_hardwares:
    - amdexperimental
    - amdproduction
  agent_pool: mi325_4
  # grade: Blocking
  gpu: h100
  num_gpus: 4
  optional: true
  working_dir: '/vllm-workspace/.buildkite/lm-eval-harness'
  source_file_dependencies:
    - csrc/
    - vllm/model_executor/layers/quantization
  commands:
    # We found Triton is faster than DeepGEMM for H100
    - export VLLM_USE_DEEP_GEMM=0
    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
##### H200 test #####
# Optional 2-GPU step: compile and distributed tests plus one data-parallel
# example, run from the workspace root.
- label: Distributed Tests (H200)  # optional
  mirror_hardwares:
    - amdexperimental
  agent_pool: mi325_2
  # grade: Blocking
  gpu: h200
  num_gpus: 2
  optional: true
  working_dir: '/vllm-workspace/'
  commands:
    - pytest -v -s tests/compile/test_async_tp.py
    - pytest -v -s tests/compile/test_sequence_parallelism.py
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
    - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
    - pytest -v -s tests/distributed/test_context_parallel.py
    # NOTE(review): this step requests num_gpus: 2 but pins device indices
    # 1,2 here - confirm the runner exposes a device with index 2.
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
    - pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
# Optional 2-GPU b200 step: context-parallel, NCCL symmetric-memory
# all-reduce and DBO tests.
- label: Distributed Tests (B200)  # optional
  gpu: b200
  num_gpus: 2
  optional: true
  working_dir: '/vllm-workspace/'
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
    - pytest -v -s tests/v1/distributed/test_dbo.py
##### RL Integration Tests #####
# Optional: runs the Prime-RL test script from .buildkite/scripts on 2 GPUs.
- label: Prime-RL Integration Test  # 15min
  timeout_in_minutes: 30
  mirror_hardwares:
    - amdexperimental
  agent_pool: mi325_2
  # grade: Blocking
  num_gpus: 2
  optional: true
  working_dir: '/vllm-workspace'
  source_file_dependencies:
    - vllm/
    - .buildkite/scripts/run-prime-rl-test.sh
  commands:
    - bash .buildkite/scripts/run-prime-rl-test.sh
# Optional: runs the DeepSeek V2-Lite scheduled integration script on 4 GPUs.
# NOTE(review): the three positional arguments are not explained in this
# file - see the script for their meaning.
- label: DeepSeek V2-Lite Accuracy
  timeout_in_minutes: 60
  mirror_hardwares:
    - amdexperimental
  agent_pool: mi325_4
  # grade: Blocking
  gpu: h100
  num_gpus: 4
  optional: true
  working_dir: '/vllm-workspace'
  commands:
    - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
# Optional: runs the Qwen3-30B-A3B FP8-block scheduled integration script on
# 4 GPUs.
# NOTE(review): the three positional arguments are not explained in this
# file - see the script for their meaning.
- label: Qwen3-30B-A3B-FP8-block Accuracy
  timeout_in_minutes: 60
  mirror_hardwares:
    - amdexperimental
  agent_pool: mi325_4
  # grade: Blocking
  gpu: h100
  num_gpus: 4
  optional: true
  working_dir: '/vllm-workspace'
  commands:
    - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables. # and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests # working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run. # source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
# autorun_on_main (bool): default to false, if true, the test will run automatically when commit is pushed to main branch.
# When adding a test # When adding a test
# - If the test belongs to an existing group, add it there # - If the test belongs to an existing group, add it there
...@@ -38,7 +39,7 @@ steps: ...@@ -38,7 +39,7 @@ steps:
- label: Pytorch Nightly Dependency Override Check # 2min - label: Pytorch Nightly Dependency Override Check # 2min
# if this test fails, it means the nightly torch version is not compatible with some # if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist # of the dependencies. Please check the error message and add the package to whitelist
# in /vllm/tools/generate_nightly_torch_test.py # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
soft_fail: true soft_fail: true
source_file_dependencies: source_file_dependencies:
- requirements/nightly_torch_test.txt - requirements/nightly_torch_test.txt
...@@ -50,19 +51,30 @@ steps: ...@@ -50,19 +51,30 @@ steps:
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/multimodal
- tests/utils_
commands:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/multimodal - tests/multimodal
- tests/utils_
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
- tests/transformers_utils - tests/transformers_utils
- tests/config
no_gpu: true
commands: commands:
- python3 standalone_tests/lazy_imports.py - python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s utils_ # Utils - pytest -v -s transformers_utils
- pytest -v -s transformers_utils # transformers_utils - pytest -v -s config
- label: Python-only Installation Test # 10min - label: Python-only Installation Test # 10min
timeout_in_minutes: 20 timeout_in_minutes: 20
...@@ -159,13 +171,12 @@ steps: ...@@ -159,13 +171,12 @@ steps:
- examples/offline_inference/rlhf.py - examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py - examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
- tests/v1/test_async_llm_dp.py - tests/v1/distributed
- tests/v1/test_external_lb_dp.py
- tests/v1/test_internal_lb_dp.py
- tests/v1/test_hybrid_lb_dp.py
- tests/v1/engine/test_engine_core_client.py - tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py - tests/distributed/test_symm_mem_allreduce.py
commands: commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and external_dp=2 # test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2 # test with torchrun tp=2 and pp=2
...@@ -180,10 +191,10 @@ steps: ...@@ -180,10 +191,10 @@ steps:
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp # test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py - pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py - pytest -v -s compile/test_basic_correctness.py
...@@ -197,6 +208,24 @@ steps: ...@@ -197,6 +208,24 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd - popd
- label: Distributed Tests (8 GPUs) # 4min
timeout_in_minutes: 10
gpu: h100
num_gpus: 8
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- examples/offline_inference/torchrun_dp_example.py
- vllm/config/parallel.py
- vllm/distributed/
- vllm/v1/engine/llm_engine.py
- vllm/v1/executor/uniproc_executor.py
- vllm/v1/worker/gpu_worker.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and dp=4 with ep
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: EPLB Algorithm Test # 5min - label: EPLB Algorithm Test # 5min
timeout_in_minutes: 15 timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
...@@ -206,8 +235,8 @@ steps: ...@@ -206,8 +235,8 @@ steps:
commands: commands:
- pytest -v -s distributed/test_eplb_algo.py - pytest -v -s distributed/test_eplb_algo.py
- label: EPLB Execution Test # 5min - label: EPLB Execution Test # 10min
timeout_in_minutes: 15 timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
...@@ -215,6 +244,7 @@ steps: ...@@ -215,6 +244,7 @@ steps:
- tests/distributed/test_eplb_execute.py - tests/distributed/test_eplb_execute.py
commands: commands:
- pytest -v -s distributed/test_eplb_execute.py - pytest -v -s distributed/test_eplb_execute.py
- pytest -v -s distributed/test_eplb_spec_decode.py
- label: Metrics, Tracing Test # 12min - label: Metrics, Tracing Test # 12min
timeout_in_minutes: 20 timeout_in_minutes: 20
...@@ -289,27 +319,56 @@ steps: ...@@ -289,27 +319,56 @@ steps:
- vllm/ - vllm/
- tests/v1 - tests/v1
commands: commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
# split the test to avoid interference # split the test to avoid interference
- pytest -v -s v1/core - pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor - pytest -v -s v1/executor
- pytest -v -s v1/kv_offload - pytest -v -s v1/kv_offload
- pytest -v -s v1/sample - pytest -v -s v1/sample
- pytest -v -s v1/logits_processors - pytest -v -s v1/logits_processors
- pytest -v -s v1/worker - pytest -v -s v1/worker
- pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode - pytest -v -s v1/spec_decode
- pytest -v -s v1/kv_connector/unit - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s v1/metrics - pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_kv_sharing.py
- pytest -v -s v1/test_metrics_reader.py
- pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_request.py - pytest -v -s v1/test_request.py
- pytest -v -s v1/test_serial_utils.py - pytest -v -s v1/test_outputs.py
- pytest -v -s v1/test_utils.py
# Integration test for streaming correctness (requires special branch). # Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: V1 Test attention (H100) # 10min
timeout_in_minutes: 30
gpu: h100
source_file_dependencies:
- vllm/v1/attention
- tests/v1/attention
commands:
- pytest -v -s v1/attention
- label: V1 Test attention (B200) # 10min
timeout_in_minutes: 30
gpu: b200
source_file_dependencies:
- vllm/v1/attention
- tests/v1/attention
commands:
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
- label: V1 Test others (CPU) # 5 mins
source_file_dependencies:
- vllm/
- tests/v1
no_gpu: true
commands:
# split the test to avoid interference
- pytest -v -s -m 'cpu_test' v1/core
- pytest -v -s v1/structured_output
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'cpu_test' v1/metrics
- label: Examples Test # 30min - label: Examples Test # 30min
timeout_in_minutes: 45 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -334,7 +393,8 @@ steps: ...@@ -334,7 +393,8 @@ steps:
- python3 offline_inference/basic/embed.py - python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py - python3 offline_inference/basic/score.py
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
- label: Platform Tests (CUDA) # 4min - label: Platform Tests (CUDA) # 4min
timeout_in_minutes: 15 timeout_in_minutes: 15
...@@ -369,7 +429,12 @@ steps: ...@@ -369,7 +429,12 @@ steps:
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
--ignore=lora/test_chatglm3_tp.py \ --ignore=lora/test_chatglm3_tp.py \
--ignore=lora/test_llama_tp.py \ --ignore=lora/test_llama_tp.py \
--ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_llm_with_multi_loras.py \
--ignore=lora/test_olmoe_tp.py \
--ignore=lora/test_deepseekv2_tp.py \
--ignore=lora/test_gptoss_tp.py \
--ignore=lora/test_qwen3moe_tp.py
parallelism: 4 parallelism: 4
- label: PyTorch Compilation Unit Tests # 15min - label: PyTorch Compilation Unit Tests # 15min
...@@ -380,15 +445,18 @@ steps: ...@@ -380,15 +445,18 @@ steps:
- vllm/ - vllm/
- tests/compile - tests/compile
commands: commands:
- pytest -v -s compile/test_graph_partition.py
- pytest -v -s compile/test_config.py
- pytest -v -s compile/test_pass_manager.py - pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py - pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_fusion_attn.py - pytest -v -s compile/test_fusion_attn.py
- pytest -v -s compile/test_functionalization.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py - pytest -v -s compile/test_silu_mul_quant_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py
- pytest -v -s compile/test_async_tp.py
- pytest -v -s compile/test_fusion_all_reduce.py - pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py - pytest -v -s compile/test_decorator.py
- pytest -v -s compile/test_noop_elimination.py - pytest -v -s compile/test_noop_elimination.py
- pytest -v -s compile/test_aot_compile.py
- pytest -v -s compile/test_qk_norm_rope_fusion.py
- label: PyTorch Fullgraph Smoke Test # 15min - label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30 timeout_in_minutes: 30
...@@ -399,17 +467,34 @@ steps: ...@@ -399,17 +467,34 @@ steps:
- tests/compile - tests/compile
commands: commands:
- pytest -v -s compile/test_basic_correctness.py - pytest -v -s compile/test_basic_correctness.py
- pytest -v -s compile/test_multimodal_compile.py
- pytest -v -s compile/piecewise/ - pytest -v -s compile/piecewise/
- label: PyTorch Fullgraph Test # 20min - label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 30 timeout_in_minutes: 40
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/compile - tests/compile
commands: commands:
- pytest -v -s compile/test_full_graph.py # fp8 kv scales not supported on sm89, tested on Blackwell instead
- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
- label: Cudagraph test
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- tests/v1/cudagraph
- vllm/v1/cudagraph_dispatcher.py
- vllm/config/compilation.py
- vllm/compilation
commands:
- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
- pytest -v -s v1/cudagraph/test_cudagraph_mode.py
- label: Kernels Core Operation Test # 48min - label: Kernels Core Operation Test # 48min
timeout_in_minutes: 75 timeout_in_minutes: 75
...@@ -417,8 +502,9 @@ steps: ...@@ -417,8 +502,9 @@ steps:
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- tests/kernels/core - tests/kernels/core
- tests/kernels/test_top_k_per_row.py
commands: commands:
- pytest -v -s kernels/core - pytest -v -s kernels/core kernels/test_top_k_per_row.py
- label: Kernels Attention Test %N # 23min - label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35 timeout_in_minutes: 35
...@@ -452,6 +538,8 @@ steps: ...@@ -452,6 +538,8 @@ steps:
- tests/kernels/moe - tests/kernels/moe
- vllm/model_executor/layers/fused_moe/ - vllm/model_executor/layers/fused_moe/
- vllm/distributed/device_communicators/ - vllm/distributed/device_communicators/
- vllm/envs.py
- vllm/config
commands: commands:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2 parallelism: 2
...@@ -462,32 +550,25 @@ steps: ...@@ -462,32 +550,25 @@ steps:
source_file_dependencies: source_file_dependencies:
- csrc/mamba/ - csrc/mamba/
- tests/kernels/mamba - tests/kernels/mamba
- vllm/model_executor/layers/mamba/ops
commands: commands:
- pytest -v -s kernels/mamba - pytest -v -s kernels/mamba
- label: Tensorizer Test # 14min - label: Model Executor Test # 23min
timeout_in_minutes: 25 timeout_in_minutes: 35
mirror_hardwares: [amdexperimental] torch_nightly: true
source_file_dependencies:
- vllm/model_executor/model_loader
- tests/tensorizer_loader
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
- label: Model Executor Test # 7min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/engine/arg_utils.py
- vllm/config/model.py
- vllm/model_executor - vllm/model_executor
- tests/model_executor - tests/model_executor
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands: commands:
- apt-get update && apt-get install -y curl libsodium23 - apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor - pytest -v -s model_executor
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
- label: Benchmarks # 11min - label: Benchmarks # 11min
timeout_in_minutes: 20 timeout_in_minutes: 20
...@@ -521,8 +602,9 @@ steps: ...@@ -521,8 +602,9 @@ steps:
# since torchao nightly is only compatible with torch nightly currently # since torchao nightly is only compatible with torch nightly currently
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 # TODO(jerryzh168): resolve the above comment
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 53min
timeout_in_minutes: 75 timeout_in_minutes: 75
...@@ -530,6 +612,7 @@ steps: ...@@ -530,6 +612,7 @@ steps:
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
autorun_on_main: true
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
...@@ -550,10 +633,17 @@ steps: ...@@ -550,10 +633,17 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/tool_use - tests/tool_use
- tests/mistral_tool_use
commands: commands:
- pytest -v -s tool_use - pytest -v -s -m 'not cpu_test' tool_use
- pytest -v -s mistral_tool_use
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/tool_use
no_gpu: true
commands:
- pytest -v -s -m 'cpu_test' tool_use
##### models test ##### ##### models test #####
...@@ -593,13 +683,19 @@ steps: ...@@ -593,13 +683,19 @@ steps:
- vllm/ - vllm/
- tests/models/test_transformers.py - tests/models/test_transformers.py
- tests/models/test_registry.py - tests/models/test_registry.py
commands:
- pytest -v -s models/test_transformers.py models/test_registry.py
- label: Basic Models Test (Other CPU) # 5min
timeout_in_minutes: 10
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/test_utils.py - tests/models/test_utils.py
- tests/models/test_vision.py - tests/models/test_vision.py
no_gpu: true
commands: commands:
- pytest -v -s models/test_transformers.py \ - pytest -v -s models/test_utils.py models/test_vision.py
models/test_registry.py \
models/test_utils.py \
models/test_vision.py
- label: Language Models Tests (Standard) - label: Language Models Tests (Standard)
timeout_in_minutes: 25 timeout_in_minutes: 25
...@@ -658,8 +754,10 @@ steps: ...@@ -658,8 +754,10 @@ steps:
- vllm/ - vllm/
- tests/models/language/generation - tests/models/language/generation
commands: commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. # Install fast path packages for testing against transformers
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' # Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
- label: Language Models Test (PPL) - label: Language Models Test (PPL)
...@@ -714,6 +812,16 @@ steps: ...@@ -714,6 +812,16 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
timeout_in_minutes: 70
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1 - label: Multi-Modal Models Test (Extended) 1
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
optional: true optional: true
...@@ -768,16 +876,17 @@ steps: ...@@ -768,16 +876,17 @@ steps:
optional: true optional: true
commands: commands:
- pip install --upgrade git+https://github.com/huggingface/transformers - pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
- pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/test_mapping.py # - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
- python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock # Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 38 min - label: Blackwell Test # 21 min
timeout_in_minutes: 60 timeout_in_minutes: 30
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
gpu: b200 gpu: b200
# optional: true # optional: true
...@@ -790,13 +899,16 @@ steps: ...@@ -790,13 +899,16 @@ steps:
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/fusion.py - vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/compilation/fusion_attn.py - vllm/v1/attention/backends/mla/flashinfer_mla.py
- vllm/platforms/cuda.py
- vllm/attention/selector.py
commands: commands:
- nvidia-smi - nvidia-smi
- python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/basic/chat.py
# Attention # Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
...@@ -808,19 +920,64 @@ steps: ...@@ -808,19 +920,64 @@ steps:
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_mxfp4_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
# Fusion
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
- pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_flashinfer.py
- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
- label: GPT-OSS Eval (Blackwell) - label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
optional: true
num_gpus: 2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusions_e2e.py
- tests/compile/test_full_graph.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- pytest -v -s tests/compile/test_fusions_e2e.py
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
gpu: b200 gpu: b200
optional: true # disable while debugging optional: true # run on nightlies
source_file_dependencies: source_file_dependencies:
- tests/evals/gpt_oss - tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py - vllm/model_executor/models/gpt_oss.py
...@@ -828,7 +985,34 @@ steps: ...@@ -828,7 +985,34 @@ steps:
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
commands: commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5' - uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2' - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- tests/quantization/test_blackwell_moe.py
- vllm/model_executor/models/deepseek_v2.py
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/models/llama4.py
- vllm/model_executor/layers/fused_moe
- vllm/model_executor/layers/quantization/compressed_tensors
- vllm/model_executor/layers/quantization/modelopt.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- pytest -s -v tests/quantization/test_blackwell_moe.py
- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
##### 1 GPU test ##### ##### 1 GPU test #####
##### multi gpus test ##### ##### multi gpus test #####
...@@ -889,19 +1073,21 @@ steps: ...@@ -889,19 +1073,21 @@ steps:
- tests/compile/test_wrapper.py - tests/compile/test_wrapper.py
- tests/distributed/ - tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py - tests/entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py - tests/v1/distributed
- tests/v1/test_external_lb_dp.py
- tests/v1/entrypoints/openai/test_multi_api_servers.py - tests/v1/entrypoints/openai/test_multi_api_servers.py
- tests/v1/shutdown - tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py - tests/v1/worker/test_worker_memory_snapshot.py
commands: commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py # https://github.com/NVIDIA/nccl/issues/1838
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py - export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py - pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py - pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py - pytest -v -s v1/worker/test_worker_memory_snapshot.py
...@@ -945,6 +1131,11 @@ steps: ...@@ -945,6 +1131,11 @@ steps:
- pytest -v -s plugins_tests/test_io_processor_plugins.py - pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y - pip uninstall prithvi_io_processor_plugin -y
# end io_processor plugins test # end io_processor plugins test
# begin stat_logger plugins test
- pip install -e ./plugins/vllm_add_dummy_stat_logger
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
- pip uninstall dummy_stat_logger -y
# end stat_logger plugins test
# other tests continue here: # other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py - pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model - pip install -e ./plugins/vllm_add_dummy_model
...@@ -984,6 +1175,8 @@ steps: ...@@ -984,6 +1175,8 @@ steps:
- pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_llm_with_multi_loras.py - pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py
- pytest -v -s -x lora/test_gptoss_tp.py
- label: Weight Loading Multiple GPU Test # 33min - label: Weight Loading Multiple GPU Test # 33min
...@@ -1010,6 +1203,17 @@ steps: ...@@ -1010,6 +1203,17 @@ steps:
commands: commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
- label: NixlConnector PD accuracy tests (Distributed) # 30min
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
##### multi gpus test ##### ##### multi gpus test #####
##### A100 test ##### ##### A100 test #####
...@@ -1040,15 +1244,34 @@ steps: ...@@ -1040,15 +1244,34 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
##### H100 test #####
- label: LM Eval Large Models (H100) # optional
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
##### H200 test ##### ##### H200 test #####
- label: Distrubted Tests (H200) # optional - label: Distributed Tests (H200) # optional
gpu: h200 gpu: h200
optional: true optional: true
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_gpus: 2 num_gpus: 2
commands: commands:
- pytest -v -s tests/compile/test_async_tp.py
- pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
- pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test ##### ##### B200 test #####
- label: Distributed Tests (B200) # optional - label: Distributed Tests (B200) # optional
...@@ -1059,6 +1282,7 @@ steps: ...@@ -1059,6 +1282,7 @@ steps:
commands: commands:
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
- pytest -v -s tests/v1/distributed/test_dbo.py
##### RL Integration Tests ##### ##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min - label: Prime-RL Integration Test # 15min
...@@ -1071,3 +1295,21 @@ steps: ...@@ -1071,3 +1295,21 @@ steps:
- .buildkite/scripts/run-prime-rl-test.sh - .buildkite/scripts/run-prime-rl-test.sh
commands: commands:
- bash .buildkite/scripts/run-prime-rl-test.sh - bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
[run] [run]
source = vllm # Track the installed vllm package (this is what actually gets imported during tests)
# Use wildcard pattern to match the installed location
source =
vllm
*/dist-packages/vllm
*/site-packages/vllm
omit = omit =
*/tests/* */tests/*
*/test_* */test_*
...@@ -12,6 +17,16 @@ omit = ...@@ -12,6 +17,16 @@ omit =
*/benchmarks/* */benchmarks/*
*/docs/* */docs/*
[paths]
# Map all possible vllm locations to a canonical "vllm" path
# This ensures coverage.combine properly merges data from different test runs
source =
vllm
/vllm-workspace/src/vllm
/vllm-workspace/vllm
*/site-packages/vllm
*/dist-packages/vllm
[report] [report]
exclude_lines = exclude_lines =
pragma: no cover pragma: no cover
......
# Migrate from `yapf` & `isort` to `ruff`
d6953beb91da4e9c99be4c0a1304a2d24189535c
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
8fcaaf6a165e661f63fc51be906bc05b0767332f
...@@ -3,17 +3,13 @@ ...@@ -3,17 +3,13 @@
# This list covers the "core" components of vLLM that require careful review # This list covers the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson /vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/fused_moe @mgoin /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn /vllm/model_executor/model_loader @22quinn
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
/vllm/v1/attention @LucasWilkinson
/vllm/v1/sample @22quinn @houseroad
/vllm/vllm_flash_attn @LucasWilkinson /vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee /vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang /vllm/reasoning @aarnphm @chaunceyjiang
...@@ -24,44 +20,57 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson ...@@ -24,44 +20,57 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig can have a large user-facing impact, # Any change to the VllmConfig can have a large user-facing impact,
# so spam a lot of people # so spam a lot of people
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg /vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1 # vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat /vllm/v1/attention @LucasWilkinson
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett /vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/spec_decode @benchislett @luccafong /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin
/vllm/v1/attention/backends/triton_attn.py @tdoublep /vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345 /vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/offloading @ApostaC /vllm/v1/offloading @ApostaC
# Test ownership # Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo /.buildkite/lm-eval-harness @mgoin
/tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao /tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche
/tests/evals @mgoin /tests/evals @mgoin
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96 /tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
/tests/test_inputs.py @DarkLight1337 @ywang96 /tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC /tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee /tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep /tests/models/language/generation/test_hybrid.py @tdoublep
/tests/v1/kv_connector/nixl_integration @NickLucche /tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC /tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC /tests/v1/offloading @ApostaC
# Transformers backend # Transformers modeling backend
/vllm/model_executor/models/transformers.py @hmellor /vllm/model_executor/models/transformers @hmellor
/tests/models/test_transformers.py @hmellor /tests/models/test_transformers.py @hmellor
# Observability
/vllm/config/observability.py @markmc
/vllm/v1/metrics @markmc
/tests/v1/metrics @markmc
/vllm/tracing.py @markmc
/tests/v1/tracing/test_tracing.py @markmc
/vllm/config/kv_events.py @markmc
/vllm/distributed/kv_events.py @markmc
/tests/distributed/test_events.py @markmc
# Docs # Docs
/docs/mkdocs @hmellor /docs/mkdocs @hmellor
/docs/**/*.yml @hmellor /docs/**/*.yml @hmellor
...@@ -106,11 +115,21 @@ mkdocs.yaml @hmellor ...@@ -106,11 +115,21 @@ mkdocs.yaml @hmellor
/vllm/attention/ops/triton_unified_attention.py @tdoublep /vllm/attention/ops/triton_unified_attention.py @tdoublep
# ROCm related: specify owner with write access to notify AMD folks for careful code review # ROCm related: specify owner with write access to notify AMD folks for careful code review
/docker/Dockerfile.rocm* @gshtras /vllm/**/*rocm* @tjtanaa
/vllm/v1/attention/backends/rocm*.py @gshtras /docker/Dockerfile.rocm* @gshtras @tjtanaa
/vllm/v1/attention/backends/mla/rocm*.py @gshtras /vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
/vllm/attention/ops/rocm*.py @gshtras /vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras /vllm/attention/ops/rocm*.py @gshtras @tjtanaa
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
/csrc/rocm @gshtras @tjtanaa
/requirements/*rocm* @tjtanaa
/tests/**/*rocm* @tjtanaa
/docs/**/*rocm* @tjtanaa
/vllm/**/*quark* @tjtanaa
/tests/**/*quark* @tjtanaa
/docs/**/*quark* @tjtanaa
/vllm/**/*aiter* @tjtanaa
/tests/**/*aiter* @tjtanaa
# TPU # TPU
/vllm/v1/worker/tpu* @NickLucche /vllm/v1/worker/tpu* @NickLucche
...@@ -120,3 +139,16 @@ mkdocs.yaml @hmellor ...@@ -120,3 +139,16 @@ mkdocs.yaml @hmellor
# KVConnector installation files # KVConnector installation files
/requirements/kv_connectors.txt @NickLucche /requirements/kv_connectors.txt @NickLucche
# Pooling models
/examples/*/pooling/ @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler.py @noooop
# Security guide and policies
/docs/usage/security.md @russellb
/SECURITY.md @russellb
/docs/contributing/vulnerability_management.md @russellb
...@@ -2,6 +2,7 @@ pull_request_rules: ...@@ -2,6 +2,7 @@ pull_request_rules:
- name: label-documentation - name: label-documentation
description: Automatically apply documentation label description: Automatically apply documentation label
conditions: conditions:
- label != stale
- or: - or:
- files~=^[^/]+\.md$ - files~=^[^/]+\.md$
- files~=^docs/ - files~=^docs/
...@@ -10,10 +11,13 @@ pull_request_rules: ...@@ -10,10 +11,13 @@ pull_request_rules:
label: label:
add: add:
- documentation - documentation
comment:
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
- name: label-ci-build - name: label-ci-build
description: Automatically apply ci/build label description: Automatically apply ci/build label
conditions: conditions:
- label != stale
- or: - or:
- files~=^\.github/ - files~=^\.github/
- files~=\.buildkite/ - files~=\.buildkite/
...@@ -30,6 +34,7 @@ pull_request_rules: ...@@ -30,6 +34,7 @@ pull_request_rules:
- name: label-deepseek - name: label-deepseek
description: Automatically apply deepseek label description: Automatically apply deepseek label
conditions: conditions:
- label != stale
- or: - or:
- files~=^examples/.*deepseek.*\.py - files~=^examples/.*deepseek.*\.py
- files~=^tests/.*deepseek.*\.py - files~=^tests/.*deepseek.*\.py
...@@ -46,6 +51,7 @@ pull_request_rules: ...@@ -46,6 +51,7 @@ pull_request_rules:
- name: label-frontend - name: label-frontend
description: Automatically apply frontend label description: Automatically apply frontend label
conditions: conditions:
- label != stale
- files~=^vllm/entrypoints/ - files~=^vllm/entrypoints/
actions: actions:
label: label:
...@@ -55,6 +61,7 @@ pull_request_rules: ...@@ -55,6 +61,7 @@ pull_request_rules:
- name: label-llama - name: label-llama
description: Automatically apply llama label description: Automatically apply llama label
conditions: conditions:
- label != stale
- or: - or:
- files~=^examples/.*llama.*\.py - files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py - files~=^tests/.*llama.*\.py
...@@ -70,6 +77,7 @@ pull_request_rules: ...@@ -70,6 +77,7 @@ pull_request_rules:
- name: label-multi-modality - name: label-multi-modality
description: Automatically apply multi-modality label description: Automatically apply multi-modality label
conditions: conditions:
- label != stale
- or: - or:
- files~=^vllm/multimodal/ - files~=^vllm/multimodal/
- files~=^tests/multimodal/ - files~=^tests/multimodal/
...@@ -83,6 +91,7 @@ pull_request_rules: ...@@ -83,6 +91,7 @@ pull_request_rules:
- name: label-new-model - name: label-new-model
description: Automatically apply new-model label description: Automatically apply new-model label
conditions: conditions:
- label != stale
- and: - and:
- files~=^vllm/model_executor/models/ - files~=^vllm/model_executor/models/
- files=vllm/model_executor/models/registry.py - files=vllm/model_executor/models/registry.py
...@@ -94,11 +103,12 @@ pull_request_rules: ...@@ -94,11 +103,12 @@ pull_request_rules:
- name: label-performance - name: label-performance
description: Automatically apply performance label description: Automatically apply performance label
conditions: conditions:
- label != stale
- or: - or:
- files~=^benchmarks/ - files~=^benchmarks/
- files~=^vllm/benchmarks/ - files~=^vllm/benchmarks/
- files~=^tests/benchmarks/ - files~=^tests/benchmarks/
- files~=^\.buildkite/nightly-benchmarks/ - files~=^\.buildkite/performance-benchmarks/
actions: actions:
label: label:
add: add:
...@@ -107,6 +117,7 @@ pull_request_rules: ...@@ -107,6 +117,7 @@ pull_request_rules:
- name: label-qwen - name: label-qwen
description: Automatically apply qwen label description: Automatically apply qwen label
conditions: conditions:
- label != stale
- or: - or:
- files~=^examples/.*qwen.*\.py - files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py - files~=^tests/.*qwen.*\.py
...@@ -121,6 +132,7 @@ pull_request_rules: ...@@ -121,6 +132,7 @@ pull_request_rules:
- name: label-gpt-oss - name: label-gpt-oss
description: Automatically apply gpt-oss label description: Automatically apply gpt-oss label
conditions: conditions:
- label != stale
- or: - or:
- files~=^examples/.*gpt[-_]?oss.*\.py - files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py - files~=^tests/.*gpt[-_]?oss.*\.py
...@@ -139,9 +151,27 @@ pull_request_rules: ...@@ -139,9 +151,27 @@ pull_request_rules:
add: add:
- gpt-oss - gpt-oss
# Mergify rule: add the `nvidia` label to pull requests that are not labeled
# `stale` and that match at least one of the file or title conditions below.
- name: label-nvidia
description: Automatically apply nvidia label
conditions:
- label != stale
- or:
# `files~=` patterns are not anchored with `^`, unlike most other rules in
# this config.
- files~=cuda
- files~=cutlass
- files~=flashinfer
- files~=trtllm
# `(?i)` makes the title patterns case-insensitive.
- title~=(?i)NVIDIA
- title~=(?i)CUDA
- title~=(?i)CUTLASS
actions:
label:
add:
- nvidia
- name: label-rocm - name: label-rocm
description: Automatically apply rocm label description: Automatically apply rocm label
conditions: conditions:
- label != stale
- or: - or:
- files~=^csrc/rocm/ - files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm - files~=^docker/Dockerfile.rocm
...@@ -162,6 +192,7 @@ pull_request_rules: ...@@ -162,6 +192,7 @@ pull_request_rules:
- name: label-structured-output - name: label-structured-output
description: Automatically apply structured-output label description: Automatically apply structured-output label
conditions: conditions:
- label != stale
- or: - or:
- files~=^benchmarks/structured_schemas/ - files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py - files=benchmarks/benchmark_serving_structured_output.py
...@@ -181,6 +212,7 @@ pull_request_rules: ...@@ -181,6 +212,7 @@ pull_request_rules:
- name: label-speculative-decoding - name: label-speculative-decoding
description: Automatically apply speculative-decoding label description: Automatically apply speculative-decoding label
conditions: conditions:
- label != stale
- or: - or:
- files~=^vllm/v1/spec_decode/ - files~=^vllm/v1/spec_decode/
- files~=^tests/v1/spec_decode/ - files~=^tests/v1/spec_decode/
...@@ -196,6 +228,7 @@ pull_request_rules: ...@@ -196,6 +228,7 @@ pull_request_rules:
- name: label-v1 - name: label-v1
description: Automatically apply v1 label description: Automatically apply v1 label
conditions: conditions:
- label != stale
- or: - or:
- files~=^vllm/v1/ - files~=^vllm/v1/
- files~=^tests/v1/ - files~=^tests/v1/
...@@ -208,6 +241,7 @@ pull_request_rules: ...@@ -208,6 +241,7 @@ pull_request_rules:
description: Automatically apply tpu label description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions # Keep this list in sync with `label-tpu-remove` conditions
conditions: conditions:
- label != stale
- or: - or:
- files~=tpu.py - files~=tpu.py
- files~=_tpu - files~=_tpu
...@@ -223,6 +257,7 @@ pull_request_rules: ...@@ -223,6 +257,7 @@ pull_request_rules:
description: Automatically remove tpu label description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions # Keep this list in sync with `label-tpu` conditions
conditions: conditions:
- label != stale
- and: - and:
- -files~=tpu.py - -files~=tpu.py
- -files~=_tpu - -files~=_tpu
...@@ -237,9 +272,9 @@ pull_request_rules: ...@@ -237,9 +272,9 @@ pull_request_rules:
- name: label-tool-calling - name: label-tool-calling
description: Automatically add tool-calling label description: Automatically add tool-calling label
conditions: conditions:
- label != stale
- or: - or:
- files~=^tests/tool_use/ - files~=^tests/tool_use/
- files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/ - files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/ - files~=^vllm/entrypoints/openai/tool_parsers/
...@@ -256,8 +291,9 @@ pull_request_rules: ...@@ -256,8 +291,9 @@ pull_request_rules:
- name: ping author on conflicts and add 'needs-rebase' label - name: ping author on conflicts and add 'needs-rebase' label
conditions: conditions:
- conflict - label != stale
- -closed - conflict
- -closed
actions: actions:
label: label:
add: add:
...@@ -271,10 +307,12 @@ pull_request_rules: ...@@ -271,10 +307,12 @@ pull_request_rules:
- name: assign reviewer for tensorizer changes - name: assign reviewer for tensorizer changes
conditions: conditions:
- label != stale
- or:
- files~=^vllm/model_executor/model_loader/tensorizer.py - files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- files~=^tests/tensorizer_loader/ - files~=^tests/model_executor/model_loader/tensorizer_loader/
actions: actions:
assign: assign:
users: users:
...@@ -282,6 +320,7 @@ pull_request_rules: ...@@ -282,6 +320,7 @@ pull_request_rules:
- name: assign reviewer for modelopt changes - name: assign reviewer for modelopt changes
conditions: conditions:
- label != stale
- or: - or:
- files~=^vllm/model_executor/layers/quantization/modelopt\.py$ - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
- files~=^vllm/model_executor/layers/quantization/__init__\.py$ - files~=^vllm/model_executor/layers/quantization/__init__\.py$
...@@ -296,8 +335,8 @@ pull_request_rules: ...@@ -296,8 +335,8 @@ pull_request_rules:
- name: remove 'needs-rebase' label when conflict is resolved - name: remove 'needs-rebase' label when conflict is resolved
conditions: conditions:
- -conflict - -conflict
- -closed - -closed
actions: actions:
label: label:
remove: remove:
...@@ -306,6 +345,7 @@ pull_request_rules: ...@@ -306,6 +345,7 @@ pull_request_rules:
- name: label-kv-connector - name: label-kv-connector
description: Automatically apply kv-connector label description: Automatically apply kv-connector label
conditions: conditions:
- label != stale
- or: - or:
- files~=^examples/online_serving/disaggregated[^/]*/.* - files~=^examples/online_serving/disaggregated[^/]*/.*
- files~=^examples/offline_inference/disaggregated[^/]*/.* - files~=^examples/offline_inference/disaggregated[^/]*/.*
......
...@@ -13,6 +13,7 @@ jobs: ...@@ -13,6 +13,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Label issues based on keywords - name: Label issues based on keywords
id: label-step
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with: with:
script: | script: |
...@@ -42,7 +43,6 @@ jobs: ...@@ -42,7 +43,6 @@ jobs:
searchIn: "body" searchIn: "body"
}, },
], ],
// Substring search - matches anywhere in text (partial matches) // Substring search - matches anywhere in text (partial matches)
substrings: [ substrings: [
{ {
...@@ -89,14 +89,12 @@ jobs: ...@@ -89,14 +89,12 @@ jobs:
term: "hip_", term: "hip_",
searchIn: "both" searchIn: "both"
}, },
// ROCm tools and libraries // ROCm tools and libraries
{ {
term: "hipify", term: "hipify",
searchIn: "both" searchIn: "both"
}, },
], ],
// Regex patterns - for complex pattern matching // Regex patterns - for complex pattern matching
regexPatterns: [ regexPatterns: [
{ {
...@@ -107,13 +105,17 @@ jobs: ...@@ -107,13 +105,17 @@ jobs:
} }
], ],
}, },
// Add more label configurations here as needed
// example: {
// keywords: [...],
// substrings: [...],
// regexPatterns: [...]
// },
}; };
// Helper function to create regex based on search type // Helper function to create regex based on search type
function createSearchRegex(term, type) { function createSearchRegex(term, type) {
// Escape special regex characters in the term // Escape special regex characters in the term
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
switch (type) { switch (type) {
case 'keyword': case 'keyword':
// Word boundary search - matches whole words only // Word boundary search - matches whole words only
...@@ -125,16 +127,13 @@ jobs: ...@@ -125,16 +127,13 @@ jobs:
throw new Error(`Unknown search type: ${type}`); throw new Error(`Unknown search type: ${type}`);
} }
} }
// Helper function to find matching terms in text with line information // Helper function to find matching terms in text with line information
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') { function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
const matches = []; const matches = [];
const lines = text.split('\n'); const lines = text.split('\n');
for (const termConfig of searchTerms) { for (const termConfig of searchTerms) {
let regex; let regex;
let term, searchIn, pattern, description, flags; let term, searchIn, pattern, description, flags;
// Handle different input formats (string or object) // Handle different input formats (string or object)
if (typeof termConfig === 'string') { if (typeof termConfig === 'string') {
term = termConfig; term = termConfig;
...@@ -146,21 +145,17 @@ jobs: ...@@ -146,21 +145,17 @@ jobs:
description = termConfig.description; description = termConfig.description;
flags = termConfig.flags; flags = termConfig.flags;
} }
// Skip if this term shouldn't be searched in the current location // Skip if this term shouldn't be searched in the current location
if (searchIn !== 'both' && searchIn !== searchLocation) { if (searchIn !== 'both' && searchIn !== searchLocation) {
continue; continue;
} }
// Create appropriate regex // Create appropriate regex
if (searchType === 'regex') { if (searchType === 'regex') {
regex = new RegExp(pattern, flags || "gi"); regex = new RegExp(pattern, flags || "gi");
} else { } else {
regex = createSearchRegex(term, searchType); regex = createSearchRegex(term, searchType);
} }
const termMatches = []; const termMatches = [];
// Check each line for matches // Check each line for matches
lines.forEach((line, lineIndex) => { lines.forEach((line, lineIndex) => {
const lineMatches = line.match(regex); const lineMatches = line.match(regex);
...@@ -175,15 +170,14 @@ jobs: ...@@ -175,15 +170,14 @@ jobs:
originalTerm: term || pattern, originalTerm: term || pattern,
description: description, description: description,
// Show context around the match in the line // Show context around the match in the line
context: line.length > 100 ? context: line.length > 100 ?
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
: line.trim() : line.trim()
}); });
}); });
} }
}); });
if (termMatches.length > 0) { if (termMatches.length > 0) {
matches.push({ matches.push({
term: term || (description || pattern), term: term || (description || pattern),
...@@ -196,64 +190,48 @@ jobs: ...@@ -196,64 +190,48 @@ jobs:
}); });
} }
} }
return matches; return matches;
} }
// Helper function to check if label should be added // Helper function to check if label should be added
async function processLabel(labelName, config) { async function processLabel(labelName, config) {
const body = context.payload.issue.body || ""; const body = context.payload.issue.body || "";
const title = context.payload.issue.title || ""; const title = context.payload.issue.title || "";
core.notice(`Processing label: ${labelName}`); core.notice(`Processing label: ${labelName}`);
core.notice(`Issue Title: "${title}"`); core.notice(`Issue Title: "${title}"`);
core.notice(`Issue Body length: ${body.length} characters`); core.notice(`Issue Body length: ${body.length} characters`);
let shouldAddLabel = false; let shouldAddLabel = false;
let allMatches = []; let allMatches = [];
let reason = ''; let reason = '';
const keywords = config.keywords || []; const keywords = config.keywords || [];
const substrings = config.substrings || []; const substrings = config.substrings || [];
const regexPatterns = config.regexPatterns || []; const regexPatterns = config.regexPatterns || [];
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`); core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
// Search in title // Search in title
if (title.trim()) { if (title.trim()) {
core.notice(`Searching in title: "${title}"`); core.notice(`Searching in title: "${title}"`);
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title'); const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title'); const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title'); const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches); allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
} }
// Search in body // Search in body
if (body.trim()) { if (body.trim()) {
core.notice(`Searching in body (${body.length} characters)`); core.notice(`Searching in body (${body.length} characters)`);
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body'); const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body'); const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body'); const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches); allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
} }
if (allMatches.length > 0) { if (allMatches.length > 0) {
core.notice(`Found ${allMatches.length} matching term(s):`); core.notice(`Found ${allMatches.length} matching term(s):`);
for (const termMatch of allMatches) { for (const termMatch of allMatches) {
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body'; const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn; const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
if (termMatch.searchType === 'regex') { if (termMatch.searchType === 'regex') {
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
} else { } else {
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
} }
// Show details for each match // Show details for each match
termMatch.matches.forEach((match, index) => { termMatch.matches.forEach((match, index) => {
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`); core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
...@@ -266,7 +244,6 @@ jobs: ...@@ -266,7 +244,6 @@ jobs:
} }
}); });
} }
shouldAddLabel = true; shouldAddLabel = true;
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0); const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0); const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
...@@ -274,13 +251,10 @@ jobs: ...@@ -274,13 +251,10 @@ jobs:
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0); const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0); const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0); const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`; reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
} }
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`); core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
core.notice(`Reason: ${reason || 'No matching terms found'}`); core.notice(`Reason: ${reason || 'No matching terms found'}`);
if (shouldAddLabel) { if (shouldAddLabel) {
const existingLabels = context.payload.issue.labels.map(l => l.name); const existingLabels = context.payload.issue.labels.map(l => l.name);
if (!existingLabels.includes(labelName)) { if (!existingLabels.includes(labelName)) {
...@@ -296,14 +270,92 @@ jobs: ...@@ -296,14 +270,92 @@ jobs:
core.notice(`Label "${labelName}" already present.`); core.notice(`Label "${labelName}" already present.`);
return false; return false;
} }
core.notice(`No matching terms found for label "${labelName}".`); core.notice(`No matching terms found for label "${labelName}".`);
return false; return false;
} }
// Process all configured labels // Process all configured labels
const processLabels = Object.entries(labelConfig) const labelsAddedResults = await Promise.all(
.map(([labelName, config]) => processLabel(labelName, config)); Object.entries(labelConfig).map(([labelName, config]) =>
const labelsAdded = await Promise.all(processLabels); processLabel(labelName, config).then(added => ({ labelName, added }))
const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0); )
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`); );
\ No newline at end of file
const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
// Return which labels were added for the next step
const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
core.setOutput('labels_added', JSON.stringify(addedLabels));
return addedLabels;
# Posts a single "CC @user ..." comment for every label that the labeling step
# just added, skipping users who are already mentioned in the issue body or in
# any existing comment.
- name: CC users for labeled issues
  # Only runs when the labeling step reported at least one newly added label.
  if: steps.label-step.outputs.labels_added != '[]'
  uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
  env:
    # Pass the step output as data rather than splicing it into the script
    # source, so its content can never be interpreted as JavaScript.
    LABELS_ADDED: ${{ steps.label-step.outputs.labels_added }}
  with:
    script: |
      // Configuration: Map labels to GitHub users to CC
      // You can add multiple users per label, and multiple label configurations
      const ccConfig = {
        rocm: {
          users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
          message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
        },
        // Add more label -> user mappings here
        // Example:
        // cuda: {
        //   users: ['user1', 'user2'],
        //   message: 'CC {users} for CUDA-related issue'
        // },
        // performance: {
        //   users: ['perfexpert'],
        //   message: 'CC {users} for performance issue'
        // },
      };

      // An empty/missing output is treated as "no labels added" instead of
      // letting JSON.parse('') throw.
      const labelsAdded = JSON.parse(process.env.LABELS_ADDED || '[]');
      core.notice(`Labels added: ${labelsAdded.join(', ')}`);

      // Get ALL existing comments (not just the first page) to check for
      // already mentioned users
      const comments = await github.paginate(github.rest.issues.listComments, {
        owner: context.repo.owner,
        repo: context.repo.repo,
        issue_number: context.issue.number,
        per_page: 100,
      });

      const issueBody = context.payload.issue.body || '';
      let allExistingText = [issueBody, ...comments.map(c => c.body || '')].join('\n');

      // Process each label that was added
      for (const label of labelsAdded) {
        const config = ccConfig[label];
        if (!config) {
          continue;
        }

        // Check which users haven't been mentioned yet
        const usersToMention = [];
        for (const user of config.users) {
          // The lookahead rejects longer logins that merely start with this
          // one (e.g. "@user-bot" is not a mention of "@user").
          const mentionPattern = new RegExp(`@${user}(?![\\w-])`, 'i');
          if (!mentionPattern.test(allExistingText)) {
            usersToMention.push(user);
          } else {
            core.notice(`@${user} already mentioned for label "${label}", skipping`);
          }
        }

        // Post comment if there are users to mention
        if (usersToMention.length > 0) {
          const mentions = usersToMention.map(u => `@${u}`).join(' ');
          const message = config.message.replace('{users}', mentions);

          await github.rest.issues.createComment({
            owner: context.repo.owner,
            repo: context.repo.repo,
            issue_number: context.issue.number,
            body: message
          });

          // Remember what was just posted so a later label in this same run
          // does not ping the same users again.
          allExistingText += '\n' + message;

          core.notice(`CC comment added for label "${label}": ${mentions}`);
        } else {
          core.notice(`All users for label "${label}" already mentioned, skipping comment`);
        }
      }
\ No newline at end of file
# Builds vLLM from source on a macOS runner, then checks that `vllm serve`
# starts and answers a health probe and a tiny completion request.
name: macOS Apple Silicon Smoke Test

on:
  push:
    branches:
      - main
  workflow_dispatch:  # Manual trigger

# Least privilege: the job only needs to read the repository contents.
permissions:
  contents: read

jobs:
  macos-m1-smoke-test:
    runs-on: macos-latest
    timeout-minutes: 20

    steps:
      - uses: actions/checkout@v4

      - uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: |
            requirements/**/*.txt
            pyproject.toml
          python-version: '3.12'

      - name: Create virtual environment
        run: |
          uv venv
          # Put the venv first on PATH for all subsequent steps.
          echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH"

      - name: Install dependencies and build vLLM
        run: |
          uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
          uv pip install -e .
        env:
          CMAKE_BUILD_PARALLEL_LEVEL: 4

      - name: Verify installation
        run: |
          python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
          python -c "import torch; print(f'PyTorch: {torch.__version__}')"

      - name: Smoke test vllm serve
        timeout-minutes: 10
        run: |
          # Start server in background
          vllm serve Qwen/Qwen3-0.6B \
            --max-model-len=2048 \
            --load-format=dummy \
            --enforce-eager \
            --port 8000 &
          SERVER_PID=$!

          # Cleanup: stop the server on every exit path, including when one
          # of the curl checks below fails and aborts the script.
          trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT

          # Wait for server to start (up to 30 attempts, 2 s apart)
          for i in {1..30}; do
            # -f makes curl fail on HTTP error responses, so only a
            # successful health response counts as "started".
            if curl -sf http://localhost:8000/health > /dev/null; then
              echo "Server started successfully"
              break
            fi
            # Stop waiting as soon as the server process is gone.
            if ! kill -0 "$SERVER_PID" 2>/dev/null; then
              echo "Server process exited before becoming healthy"
              exit 1
            fi
            if [ "$i" -eq 30 ]; then
              echo "Server failed to start"
              exit 1
            fi
            sleep 2
          done

          # Test health endpoint
          curl -f http://localhost:8000/health

          # Test completion
          curl -f http://localhost:8000/v1/completions \
            -H "Content-Type: application/json" \
            -d '{
              "model": "Qwen/Qwen3-0.6B",
              "prompt": "Hello",
              "max_tokens": 5
            }'
...@@ -13,7 +13,7 @@ jobs: ...@@ -13,7 +13,7 @@ jobs:
actions: write actions: write
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0 - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
with: with:
# Increasing this value ensures that changes to this workflow # Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months # propagate to all issues and PRs in days rather than months
......
...@@ -94,6 +94,9 @@ ipython_config.py ...@@ -94,6 +94,9 @@ ipython_config.py
# generated files # generated files
**/generated/** **/generated/**
# uv
uv.lock
# pyenv # pyenv
# For a library or package, you might want to ignore these files since the code is # For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in: # intended to run in multiple environments; otherwise, check them in:
...@@ -218,3 +221,6 @@ csrc/moe/marlin_moe_wna16/kernel_* ...@@ -218,3 +221,6 @@ csrc/moe/marlin_moe_wna16/kernel_*
# Ignore ep_kernels_workspace folder # Ignore ep_kernels_workspace folder
ep_kernels_workspace/ ep_kernels_workspace/
# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
!vllm/benchmarks/lib/
...@@ -3,11 +3,9 @@ MD007: ...@@ -3,11 +3,9 @@ MD007:
MD013: false MD013: false
MD024: MD024:
siblings_only: true siblings_only: true
MD031:
list_items: false
MD033: false MD033: false
MD042: false
MD045: false
MD046: false MD046: false
MD051: false
MD052: false MD052: false
MD053: false
MD059: false MD059: false
...@@ -6,30 +6,19 @@ default_stages: ...@@ -6,30 +6,19 @@ default_stages:
- manual # Run in CI - manual # Run in CI
exclude: 'vllm/third_party/.*' exclude: 'vllm/third_party/.*'
repos: repos:
- repo: https://github.com/google/yapf
rev: v0.43.0
hooks:
- id: yapf
args: [--in-place, --verbose]
# Keep the same list from yapfignore here to avoid yapf failing without any inputs
exclude: '(.buildkite|benchmarks|build|examples)/.*'
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7 rev: v0.14.0
hooks: hooks:
- id: ruff - id: ruff-check
args: [--output-format, github, --fix] args: [--output-format, github, --fix]
- id: ruff-format - id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos - repo: https://github.com/crate-ci/typos
rev: v1.35.5 rev: v1.38.1
hooks: hooks:
- id: typos - id: typos
- repo: https://github.com/PyCQA/isort args: [--force-exclude]
rev: 6.0.1
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format - repo: https://github.com/pre-commit/mirrors-clang-format
rev: v20.1.3 rev: v21.1.2
hooks: hooks:
- id: clang-format - id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
...@@ -46,32 +35,27 @@ repos: ...@@ -46,32 +35,27 @@ repos:
hooks: hooks:
- id: actionlint - id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit - repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.6.17 rev: 0.9.1
hooks: hooks:
- id: pip-compile - id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28] args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
files: ^requirements/test\.(in|txt)$ files: ^requirements/test\.(in|txt)$
- repo: local - repo: local
hooks: hooks:
- id: format-torch-nightly-test - id: format-torch-nightly-test
name: reformat nightly_torch_test.txt to be in sync with test.in name: reformat nightly_torch_test.txt to be in sync with test.in
language: python language: python
entry: python tools/generate_nightly_torch_test.py entry: python tools/pre_commit/generate_nightly_torch_test.py
files: ^requirements/test\.(in|txt)$ files: ^requirements/test\.(in|txt)$
- id: mypy-local - id: mypy-local
name: Run mypy for local Python installation name: Run mypy locally for lowest supported Python version
entry: python tools/pre_commit/mypy.py 0 "local" entry: python tools/pre_commit/mypy.py 0 "3.10"
stages: [pre-commit] # Don't run in CI stages: [pre-commit] # Don't run in CI
<<: &mypy_common <<: &mypy_common
language: python language: python
types_or: [python, pyi] types_or: [python, pyi]
require_serial: true require_serial: true
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic] additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9
entry: python tools/pre_commit/mypy.py 1 "3.9"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10 name: Run mypy for Python 3.10
entry: python tools/pre_commit/mypy.py 1 "3.10" entry: python tools/pre_commit/mypy.py 1 "3.10"
...@@ -87,14 +71,19 @@ repos: ...@@ -87,14 +71,19 @@ repos:
entry: python tools/pre_commit/mypy.py 1 "3.12" entry: python tools/pre_commit/mypy.py 1 "3.12"
<<: *mypy_common <<: *mypy_common
stages: [manual] # Only run in CI stages: [manual] # Only run in CI
- id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.13
entry: python tools/pre_commit/mypy.py 1 "3.13"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: shellcheck - id: shellcheck
name: Lint shell scripts name: Lint shell scripts
entry: tools/shellcheck.sh entry: tools/pre_commit/shellcheck.sh
language: script language: script
types: [shell] types: [shell]
- id: png-lint - id: png-lint
name: Lint PNG exports from excalidraw name: Lint PNG exports from excalidraw
entry: tools/png-lint.sh entry: tools/pre_commit/png-lint.sh
language: script language: script
types: [png] types: [png]
- id: signoff-commit - id: signoff-commit
...@@ -111,12 +100,12 @@ repos: ...@@ -111,12 +100,12 @@ repos:
stages: [commit-msg] stages: [commit-msg]
- id: check-spdx-header - id: check-spdx-header
name: Check SPDX headers name: Check SPDX headers
entry: python tools/check_spdx_header.py entry: python tools/pre_commit/check_spdx_header.py
language: python language: python
types: [python] types: [python]
- id: check-root-lazy-imports - id: check-root-lazy-imports
name: Check root lazy imports name: Check root lazy imports
entry: python tools/check_init_lazy_imports.py entry: python tools/pre_commit/check_init_lazy_imports.py
language: python language: python
types: [python] types: [python]
- id: check-filenames - id: check-filenames
...@@ -130,11 +119,11 @@ repos: ...@@ -130,11 +119,11 @@ repos:
pass_filenames: false pass_filenames: false
- id: update-dockerfile-graph - id: update-dockerfile-graph
name: Update Dockerfile dependency graph name: Update Dockerfile dependency graph
entry: tools/update-dockerfile-graph.sh entry: tools/pre_commit/update-dockerfile-graph.sh
language: script language: script
- id: enforce-import-regex-instead-of-re - id: enforce-import-regex-instead-of-re
name: Enforce import regex as re name: Enforce import regex as re
entry: python tools/enforce_regex_import.py entry: python tools/pre_commit/enforce_regex_import.py
language: python language: python
types: [python] types: [python]
pass_filenames: false pass_filenames: false
...@@ -142,7 +131,7 @@ repos: ...@@ -142,7 +131,7 @@ repos:
# forbid directly import triton # forbid directly import triton
- id: forbid-direct-triton-import - id: forbid-direct-triton-import
name: "Forbid direct 'import triton'" name: "Forbid direct 'import triton'"
entry: python tools/check_triton_import.py entry: python tools/pre_commit/check_triton_import.py
language: python language: python
types: [python] types: [python]
pass_filenames: false pass_filenames: false
...@@ -155,7 +144,7 @@ repos: ...@@ -155,7 +144,7 @@ repos:
additional_dependencies: [regex] additional_dependencies: [regex]
- id: validate-config - id: validate-config
name: Validate configuration has default values and that each field has a docstring name: Validate configuration has default values and that each field has a docstring
entry: python tools/validate_config.py entry: python tools/pre_commit/validate_config.py
language: python language: python
additional_dependencies: [regex] additional_dependencies: [regex]
# Keep `suggestion` last # Keep `suggestion` last
......
...@@ -34,11 +34,18 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) ...@@ -34,11 +34,18 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the # Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py. # first match will be selected. These should be kept in sync with setup.py.
# #
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13") set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures. # Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx928;gfx936") set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151;gfx928;gfx936")
# ROCm installation prefix. Default to /opt/rocm but allow override via
# -DROCM_PATH=/your/rocm/path when invoking cmake.
if(NOT DEFINED ROCM_PATH)
set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCm installation prefix")
else()
set(ROCM_PATH ${ROCM_PATH} CACHE PATH "ROCm installation prefix" FORCE)
endif()
# #
# Supported/expected torch versions for CUDA/ROCm. # Supported/expected torch versions for CUDA/ROCm.
# #
...@@ -49,8 +56,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1 ...@@ -49,8 +56,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# requirements.txt files and should be kept consistent. The ROCm torch # requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm # versions are derived from docker/Dockerfile.rocm
# #
set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0") set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0") set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
# #
# Try to find python package with an executable that exactly matches # Try to find python package with an executable that exactly matches
...@@ -86,6 +93,9 @@ find_package(Torch REQUIRED) ...@@ -86,6 +93,9 @@ find_package(Torch REQUIRED)
# Supported NVIDIA architectures. # Supported NVIDIA architectures.
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
else() else()
...@@ -175,6 +185,15 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -175,6 +185,15 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif() endif()
#
# Set compression mode for CUDA >=13.x.
#
if(VLLM_GPU_LANG STREQUAL "CUDA" AND
DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
endif()
# #
# Set CUDA include flags for CXX compiler. # Set CUDA include flags for CXX compiler.
# #
...@@ -225,11 +244,28 @@ set_gencode_flags_for_srcs( ...@@ -225,11 +244,28 @@ set_gencode_flags_for_srcs(
SRCS "${VLLM_CUMEM_EXT_SRC}" SRCS "${VLLM_CUMEM_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}") CUDA_ARCHS "${CUDA_ARCHS}")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling cumem allocator extension.") message(STATUS "Enabling cumem allocator extension.")
# link against cuda driver library if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND CUMEM_LIBS CUDA::cuda_driver) # link against cuda driver library
define_gpu_extension_target( list(APPEND CUMEM_LIBS CUDA::cuda_driver)
else()
# link against rocm driver library. Prefer an absolute path to
# libamdhip64.so inside ${ROCM_PATH}/lib if available, otherwise fall
# back to linking by name "amdhip64".
find_library(AMDHIP64_LIB
NAMES amdhip64 libamdhip64.so
PATHS ${ROCM_PATH}/lib
NO_DEFAULT_PATH)
if(AMDHIP64_LIB)
message(STATUS "Found libamdhip64 at ${AMDHIP64_LIB}")
list(APPEND CUMEM_LIBS ${AMDHIP64_LIB})
else()
message(WARNING "libamdhip64 not found in ${ROCM_PATH}/lib; falling back to linking 'amdhip64' by name")
list(APPEND CUMEM_LIBS amdhip64)
endif()
endif()
define_extension_target(
cumem_allocator cumem_allocator
DESTINATION vllm DESTINATION vllm
LANGUAGE CXX LANGUAGE CXX
...@@ -253,13 +289,13 @@ set(VLLM_EXT_SRC ...@@ -253,13 +289,13 @@ set(VLLM_EXT_SRC
"csrc/pos_encoding_kernels.cu" "csrc/pos_encoding_kernels.cu"
"csrc/activation_kernels.cu" "csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu" "csrc/layernorm_kernels.cu"
"csrc/opt/transpose_kernels.cu" "csrc/fused_qknorm_rope_kernel.cu"
# "csrc/layernorm_quant_kernels.cu" # "csrc/layernorm_quant_kernels.cu"
"csrc/sampler.cu" "csrc/sampler.cu"
"csrc/cuda_view.cu" "csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/w8a8/int8/scaled_quant.cu"
# "csrc/quantization/fp8/common.cu" # "csrc/quantization/w8a8/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu" "csrc/quantization/gguf/gguf_kernel.cu"
# "csrc/quantization/activation_kernels.cu" # "csrc/quantization/activation_kernels.cu"
...@@ -271,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -271,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use") set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
...@@ -303,13 +339,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -303,13 +339,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC list(APPEND VLLM_EXT_SRC
"csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu"
"csrc/permute_cols.cu" "csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu" "csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp" "csrc/cutlass_extensions/common.cpp"
"csrc/quantization/fp8/per_token_group_quant.cu") "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
"csrc/quantization/w8a8/int8/per_token_group_quant.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}" SRCS "${VLLM_EXT_SRC}"
...@@ -319,7 +355,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -319,7 +355,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Keep building Marlin for 9.0 as there are some group sizes and shapes that # Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet. # are not supported by Machete yet.
# 9.0 for latest bf16 atomicAdd PTX # 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}") cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_ARCHS) if (MARLIN_ARCHS)
# #
...@@ -413,11 +449,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -413,11 +449,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}") CUDA_ARCHS "${SCALED_MM_ARCHS}")
...@@ -441,12 +477,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -441,12 +477,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.8 or later # CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu" "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu" "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu" "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
) )
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
...@@ -471,12 +511,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -471,12 +511,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x) # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# require CUDA 12.8 or later # require CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu" "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
) )
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
...@@ -507,7 +551,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -507,7 +551,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# subtract out the archs that are already built for 3x # subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS) if (SCALED_MM_2X_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu") set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}") CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
...@@ -551,7 +595,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -551,7 +595,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
# CUDA 12.8 or later # CUDA 12.8 or later
cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
...@@ -570,7 +618,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -570,7 +618,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
# FP4 Archs and flags # FP4 Archs and flags
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
...@@ -592,7 +644,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -592,7 +644,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
# CUTLASS MLA Archs and flags # CUTLASS MLA Archs and flags
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
set(SRCS set(SRCS
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu") "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
...@@ -618,7 +674,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -618,7 +674,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# if it's possible to compile MoE kernels that use its output. # if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu") set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}") CUDA_ARCHS "${SCALED_MM_ARCHS}")
...@@ -636,9 +692,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -636,9 +692,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
endif() endif()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu") set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}") CUDA_ARCHS "${SCALED_MM_ARCHS}")
...@@ -657,9 +717,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -657,9 +717,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
# moe_data.cu is used by all CUTLASS MoE kernels. # moe_data.cu is used by all CUTLASS MoE kernels.
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}") CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
...@@ -676,9 +740,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -676,9 +740,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
endif() endif()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu") set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}") CUDA_ARCHS "${SCALED_MM_ARCHS}")
...@@ -793,7 +861,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -793,7 +861,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
# Hadacore kernels # Hadacore kernels
cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}") cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
if(HADACORE_ARCHS) if(HADACORE_ARCHS)
set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu") set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
...@@ -815,7 +883,7 @@ if (VLLM_GPU_LANG STREQUAL "HIP") ...@@ -815,7 +883,7 @@ if (VLLM_GPU_LANG STREQUAL "HIP")
endif() endif()
message(STATUS "Enabling C extension.") message(STATUS "Enabling C extension.")
define_gpu_extension_target( define_extension_target(
_C _C
DESTINATION vllm DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG} LANGUAGE ${VLLM_GPU_LANG}
...@@ -840,6 +908,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) ...@@ -840,6 +908,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp" "csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu" "csrc/moe/moe_align_sum_kernels.cu"
"csrc/moe/moe_lora_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu") "csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
...@@ -870,7 +939,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -870,7 +939,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
# 9.0 for latest bf16 atomicAdd PTX # 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}") cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS) if (MARLIN_MOE_ARCHS)
# #
...@@ -929,7 +998,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -929,7 +998,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
message(STATUS "Enabling moe extension.") message(STATUS "Enabling moe extension.")
define_gpu_extension_target( define_extension_target(
_moe_C _moe_C
DESTINATION vllm DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG} LANGUAGE ${VLLM_GPU_LANG}
...@@ -951,7 +1020,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP") ...@@ -951,7 +1020,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
"csrc/rocm/skinny_gemms.cu" "csrc/rocm/skinny_gemms.cu"
"csrc/rocm/attention.cu") "csrc/rocm/attention.cu")
define_gpu_extension_target( define_extension_target(
_rocm_C _rocm_C
DESTINATION vllm DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG} LANGUAGE ${VLLM_GPU_LANG}
...@@ -966,6 +1035,7 @@ endif() ...@@ -966,6 +1035,7 @@ endif()
# For CUDA we also build and ship some external projects. # For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA") if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake) include(cmake/external_projects/flashmla.cmake)
include(cmake/external_projects/qutlass.cmake)
# vllm-flash-attn should be last as it overwrites some CMake functions # vllm-flash-attn should be last as it overwrites some CMake functions
include(cmake/external_projects/vllm_flash_attn.cmake) include(cmake/external_projects/vllm_flash_attn.cmake)
......
...@@ -21,6 +21,10 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio ...@@ -21,6 +21,10 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
*Latest News* 🔥 *Latest News* 🔥
- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI).
- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA). - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing). - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH). - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
...@@ -81,7 +85,7 @@ vLLM is flexible and easy to use with: ...@@ -81,7 +85,7 @@ vLLM is flexible and easy to use with:
- Tensor, pipeline, data and expert parallelism support for distributed inference - Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs - Streaming outputs
- OpenAI-compatible API server - OpenAI-compatible API server
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend. - Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
- Prefix caching support - Prefix caching support
- Multi-LoRA support - Multi-LoRA support
...@@ -148,6 +152,7 @@ Compute Resources: ...@@ -148,6 +152,7 @@ Compute Resources:
- Trainy - Trainy
- UC Berkeley - UC Berkeley
- UC San Diego - UC San Diego
- Volcengine
Slack Sponsor: Anyscale Slack Sponsor: Anyscale
...@@ -178,4 +183,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ...@@ -178,4 +183,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Media Kit ## Media Kit
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit) - If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)
\ No newline at end of file
...@@ -74,7 +74,7 @@ start_server() { ...@@ -74,7 +74,7 @@ start_server() {
local vllm_log=$4 local vllm_log=$4
local profile_dir=$5 local profile_dir=$5
pkill -if vllm pkill -if "vllm serve" || true
# Define the common arguments as a bash array. # Define the common arguments as a bash array.
# Each argument and its value are separate elements. # Each argument and its value are separate elements.
...@@ -96,11 +96,11 @@ start_server() { ...@@ -96,11 +96,11 @@ start_server() {
# This correctly passes each element as a separate argument. # This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled # Start server with profiling enabled
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else else
# Start server without profiling # Start server without profiling
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \ VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi fi
local server_pid=$! local server_pid=$!
...@@ -139,7 +139,7 @@ run_benchmark() { ...@@ -139,7 +139,7 @@ run_benchmark() {
echo "vllm_log: $vllm_log" echo "vllm_log: $vllm_log"
echo echo
rm -f $vllm_log rm -f $vllm_log
pkill -if vllm pkill -if "vllm serve" || true
echo "starting server..." echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead # Call start_server without a profile_dir to avoid profiling overhead
...@@ -232,7 +232,7 @@ run_benchmark() { ...@@ -232,7 +232,7 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
pkill -if vllm pkill -if "vllm serve" || true
sleep 10 sleep 10
echo "====================" echo "===================="
return 0 return 0
...@@ -308,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then ...@@ -308,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
else else
echo "No configuration met the latency requirements. Skipping final profiling run." echo "No configuration met the latency requirements. Skipping final profiling run."
fi fi
pkill -if vllm pkill -if "vllm serve" || true
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
...@@ -8,7 +8,6 @@ import sys ...@@ -8,7 +8,6 @@ import sys
import time import time
import traceback import traceback
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional, Union
import aiohttp import aiohttp
import huggingface_hub.constants import huggingface_hub.constants
...@@ -28,13 +27,13 @@ class RequestFuncInput: ...@@ -28,13 +27,13 @@ class RequestFuncInput:
prompt_len: int prompt_len: int
output_len: int output_len: int
model: str model: str
model_name: Optional[str] = None model_name: str | None = None
logprobs: Optional[int] = None logprobs: int | None = None
extra_body: Optional[dict] = None extra_body: dict | None = None
multi_modal_content: Optional[dict | list[dict]] = None multi_modal_content: dict | list[dict] | None = None
ignore_eos: bool = False ignore_eos: bool = False
language: Optional[str] = None language: str | None = None
request_id: Optional[str] = None request_id: str | None = None
@dataclass @dataclass
...@@ -52,7 +51,7 @@ class RequestFuncOutput: ...@@ -52,7 +51,7 @@ class RequestFuncOutput:
async def async_request_tgi( async def async_request_tgi(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None, pbar: tqdm | None = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith("generate_stream") assert api_url.endswith("generate_stream")
...@@ -133,7 +132,7 @@ async def async_request_tgi( ...@@ -133,7 +132,7 @@ async def async_request_tgi(
async def async_request_trt_llm( async def async_request_trt_llm(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None, pbar: tqdm | None = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith("generate_stream") assert api_url.endswith("generate_stream")
...@@ -204,7 +203,7 @@ async def async_request_trt_llm( ...@@ -204,7 +203,7 @@ async def async_request_trt_llm(
async def async_request_deepspeed_mii( async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None, pbar: tqdm | None = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), ( assert api_url.endswith(("completions", "profile")), (
...@@ -267,7 +266,7 @@ async def async_request_deepspeed_mii( ...@@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(
async def async_request_openai_completions( async def async_request_openai_completions(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None, pbar: tqdm | None = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), ( assert api_url.endswith(("completions", "profile")), (
...@@ -367,7 +366,7 @@ async def async_request_openai_completions( ...@@ -367,7 +366,7 @@ async def async_request_openai_completions(
async def async_request_openai_chat_completions( async def async_request_openai_chat_completions(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None, pbar: tqdm | None = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith(("chat/completions", "profile")), ( assert api_url.endswith(("chat/completions", "profile")), (
...@@ -476,7 +475,7 @@ async def async_request_openai_chat_completions( ...@@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(
async def async_request_openai_audio( async def async_request_openai_audio(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None, pbar: tqdm | None = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
# Lazy import without PlaceholderModule to avoid vllm dep. # Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile import soundfile
...@@ -610,7 +609,7 @@ def get_tokenizer( ...@@ -610,7 +609,7 @@ def get_tokenizer(
tokenizer_mode: str = "auto", tokenizer_mode: str = "auto",
trust_remote_code: bool = False, trust_remote_code: bool = False,
**kwargs, **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: ) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
if pretrained_model_name_or_path is not None and not os.path.exists( if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path pretrained_model_name_or_path
): ):
......
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode.
This benchmark runs the same workload twice:
1. With VLLM_BATCH_INVARIANT=0 (baseline)
2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode)
And reports the timing and throughput metrics for comparison.
Environment variables:
VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B")
VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek)
VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128)
VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5)
VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024)
VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048)
VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128)
VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0)
VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4)
VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120)
VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN)
Example usage:
# Benchmark qwen3 (default)
python benchmarks/benchmark_batch_invariance.py
# Benchmark deepseek with 8 GPUs
VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\
python benchmarks/benchmark_batch_invariance.py
# Quick test with fewer trials
VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\
python benchmarks/benchmark_batch_invariance.py
"""
import contextlib
import os
import random
import time
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
"""Generate a random prompt for benchmarking."""
prompt_templates = [
"Question: What is the capital of France?\nAnswer: The capital of France is",
"Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
"User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
"Once upon a time in a distant galaxy, there lived",
"The old man walked slowly down the street, remembering",
"In the year 2157, humanity finally discovered",
"To implement a binary search tree in Python, first we need to",
"The algorithm works by iterating through the array and",
"Here's how to optimize database queries using indexing:",
"The Renaissance was a period in European history that",
"Climate change is caused by several factors including",
"The human brain contains approximately 86 billion neurons which",
"I've been thinking about getting a new laptop because",
"Yesterday I went to the store and bought",
"My favorite thing about summer is definitely",
]
base_prompt = random.choice(prompt_templates)
if max_words < min_words:
max_words = min_words
target_words = random.randint(min_words, max_words)
if target_words > 50:
padding_text = (
" This is an interesting topic that deserves more explanation. "
* (target_words // 50)
)
base_prompt = base_prompt + padding_text
return base_prompt
def run_benchmark_with_batch_invariant(
    model: str,
    tp_size: int,
    max_batch_size: int,
    num_trials: int,
    min_prompt: int,
    max_prompt: int,
    max_tokens: int,
    temperature: float,
    gpu_mem_util: float,
    max_model_len: int,
    backend: str,
    batch_invariant: bool,
    seed: int = 12345,
) -> dict:
    """Run one benchmark phase and collect timing and throughput metrics.

    The function sets ``VLLM_ATTENTION_BACKEND`` and ``VLLM_BATCH_INVARIANT``
    in ``os.environ`` (they stay set after it returns), builds an ``LLM``
    engine, issues one warmup generation, and then times ``num_trials``
    randomly sized batches. The engine is shut down on exit, including when
    an error occurs.

    Args:
        model: Model name passed to ``LLM``.
        tp_size: Tensor parallel size.
        max_batch_size: Upper bound on the batch size and ``max_num_seqs``.
        num_trials: Number of timed batches.
        min_prompt: Lower bound forwarded to ``_random_prompt``.
        max_prompt: Upper bound forwarded to ``_random_prompt``.
        max_tokens: Maximum tokens to generate per prompt.
        temperature: Sampling temperature.
        gpu_mem_util: Value for ``gpu_memory_utilization``.
        max_model_len: Value for ``max_model_len``.
        backend: Value written to ``VLLM_ATTENTION_BACKEND``.
        batch_invariant: Whether ``VLLM_BATCH_INVARIANT`` is set to "1" or "0".
        seed: Seed for the ``random`` module (batch sizes and prompts).

    Returns:
        A dict with keys ``init_time``, ``avg_time``, ``min_time``,
        ``max_time``, ``total_tokens``, ``total_prompts``, ``throughput``,
        ``prompts_per_sec`` and ``trial_times``.

    Raises:
        ValueError: If ``max_batch_size`` or ``num_trials`` is below 1.
    """
    # Fail fast: without these checks a bad value only surfaces after the
    # slow engine start, as an empty randint range or a division by zero.
    if max_batch_size < 1:
        raise ValueError(f"max_batch_size must be >= 1, got {max_batch_size}")
    if num_trials < 1:
        raise ValueError(f"num_trials must be >= 1, got {num_trials}")

    random.seed(seed)

    # Set environment variables
    os.environ["VLLM_ATTENTION_BACKEND"] = backend
    os.environ["VLLM_BATCH_INVARIANT"] = "1" if batch_invariant else "0"

    print(f"\n{'=' * 80}")
    print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}")
    print(f" Model: {model}")
    print(f" TP Size: {tp_size}")
    print(f" Backend: {backend}")
    print(f" Max Batch Size: {max_batch_size}")
    print(f" Trials: {num_trials}")
    print(f" Max Tokens: {max_tokens}")
    print(f"{'=' * 80}\n")

    sampling = SamplingParams(
        temperature=temperature,
        top_p=0.95,
        max_tokens=max_tokens,
        seed=20240919,
    )
    needle_prompt = "There once was a "

    llm = None
    try:
        # Create LLM engine
        start_init = time.perf_counter()
        llm = LLM(
            model=model,
            max_num_seqs=max_batch_size,
            gpu_memory_utilization=gpu_mem_util,
            max_model_len=max_model_len,
            dtype="bfloat16",
            tensor_parallel_size=tp_size,
            enable_prefix_caching=False,
        )
        init_time = time.perf_counter() - start_init
        print(f"Engine initialization time: {init_time:.2f}s\n")

        # Generate baseline; this untimed call also serves as warmup.
        print("Generating baseline (warmup)...")
        baseline_out = llm.generate([needle_prompt], sampling)
        assert len(baseline_out) == 1
        baseline_text = baseline_out[0].outputs[0].text
        print(f"Baseline output: '{baseline_text[:50]}...'\n")

        # Run trials and measure timing
        trial_times: list[float] = []
        total_tokens = 0
        total_prompts = 0

        for trial in range(num_trials):
            # Create a batch. The lower bound is clamped to 1 so that
            # max_batch_size == 1 cannot yield an empty batch, which would
            # make the needle-position draw below raise ValueError.
            batch_size = random.randint(max(1, max_batch_size // 2), max_batch_size)
            needle_pos = random.randint(0, batch_size - 1)
            prompts: list[str] = [
                needle_prompt
                if i == needle_pos
                else _random_prompt(min_prompt, max_prompt)
                for i in range(batch_size)
            ]

            # Measure time for this trial
            start_time = time.perf_counter()
            outputs = llm.generate(prompts, sampling)
            trial_time = time.perf_counter() - start_time

            trial_times.append(trial_time)
            total_prompts += len(prompts)

            # Count tokens of the first completion of every request
            total_tokens += sum(
                len(output.outputs[0].token_ids)
                for output in outputs
                if output.outputs
            )

            print(
                f"Trial {trial + 1}/{num_trials}: "
                f"batch_size={batch_size}, "
                f"time={trial_time:.2f}s"
            )

            # Sanity check: the result in the needle slot belongs to the
            # needle prompt. The generated text is not compared to the
            # baseline here.
            needle_output = outputs[needle_pos]
            assert needle_output.prompt == needle_prompt

        # Compute statistics
        total_time = sum(trial_times)
        avg_time = total_time / len(trial_times)
        min_time = min(trial_times)
        max_time = max(trial_times)
        throughput = total_tokens / total_time
        prompts_per_sec = total_prompts / total_time

        print(f"\n{'=' * 80}")
        print("RESULTS:")
        print(f" Average time per trial: {avg_time:.2f}s")
        print(f" Min time: {min_time:.2f}s")
        print(f" Max time: {max_time:.2f}s")
        print(f" Total tokens generated: {total_tokens}")
        print(f" Total prompts processed: {total_prompts}")
        print(f" Throughput: {throughput:.2f} tokens/s")
        print(f" Prompts/s: {prompts_per_sec:.2f}")
        print(f"{'=' * 80}\n")

        return {
            "init_time": init_time,
            "avg_time": avg_time,
            "min_time": min_time,
            "max_time": max_time,
            "total_tokens": total_tokens,
            "total_prompts": total_prompts,
            "throughput": throughput,
            "prompts_per_sec": prompts_per_sec,
            "trial_times": trial_times,
        }
    finally:
        # Cleanup: best-effort shutdown so a failure here never masks the
        # original error or the returned metrics.
        if llm is not None:
            with contextlib.suppress(Exception):
                llm.shutdown()
def main():
    """Benchmark baseline vs. batch-invariant mode and print a comparison.

    Configuration comes from the ``VLLM_BENCH_*`` environment variables. The
    same workload runs twice through ``run_benchmark_with_batch_invariant``
    (first with batch invariance off, then on), and the relative differences
    in init time, average trial time and throughput are printed.

    Returns:
        1 when the platform check fails (CUDA with device capability 90 is
        required), otherwise 0.
    """
    rule = "=" * 80

    # Check platform support
    supported = current_platform.is_cuda() and current_platform.has_device_capability(
        90
    )
    if not supported:
        print("ERROR: Requires CUDA and >= Hopper (SM90)")
        print(f"Current platform: {current_platform.device_type}")
        if current_platform.is_cuda():
            print(f"Device capability: {current_platform.get_device_capability()}")
        return 1

    # Read configuration from environment. Keys mirror the keyword arguments
    # of run_benchmark_with_batch_invariant so both phases share one dict.
    env = os.getenv
    cfg = {
        "model": env("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B"),
        "tp_size": int(env("VLLM_BENCH_TP_SIZE", "1")),
        "max_batch_size": int(env("VLLM_BENCH_BATCH_SIZE", "128")),
        "num_trials": int(env("VLLM_BENCH_NUM_TRIALS", "5")),
        "min_prompt": int(env("VLLM_BENCH_MIN_PROMPT", "1024")),
        "max_prompt": int(env("VLLM_BENCH_MAX_PROMPT", "2048")),
        "max_tokens": int(env("VLLM_BENCH_MAX_TOKENS", "128")),
        "temperature": float(env("VLLM_BENCH_TEMPERATURE", "0.0")),
        "gpu_mem_util": float(env("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4")),
        "max_model_len": int(env("VLLM_BENCH_MAX_MODEL_LEN", "5120")),
        "backend": env("VLLM_BENCH_BACKEND", "FLASH_ATTN"),
    }

    print("\n" + rule)
    print("VLLM BATCH INVARIANCE BENCHMARK")
    print(rule)
    print("\nConfiguration:")
    print(f" Model: {cfg['model']}")
    print(f" Tensor Parallel Size: {cfg['tp_size']}")
    print(f" Attention Backend: {cfg['backend']}")
    print(f" Max Batch Size: {cfg['max_batch_size']}")
    print(f" Number of Trials: {cfg['num_trials']}")
    print(f" Prompt Length Range: {cfg['min_prompt']}-{cfg['max_prompt']} words")
    print(f" Max Tokens to Generate: {cfg['max_tokens']}")
    print(f" Temperature: {cfg['temperature']}")
    print(f" GPU Memory Utilization: {cfg['gpu_mem_util']}")
    print(f" Max Model Length: {cfg['max_model_len']}")
    print(rule)

    # Run benchmark WITHOUT batch invariance (baseline)
    print("\n" + rule)
    print("PHASE 1: Running WITHOUT batch invariance (baseline)")
    print(rule)
    base = run_benchmark_with_batch_invariant(**cfg, batch_invariant=False)

    # Run benchmark WITH batch invariance
    print("\n" + rule)
    print("PHASE 2: Running WITH batch invariance")
    print(rule)
    inv = run_benchmark_with_batch_invariant(**cfg, batch_invariant=True)

    # Compare results
    print("\n" + rule)
    print("COMPARISON: Batch Invariance vs Baseline")
    print(rule)

    def pct_delta(metric: str) -> float:
        # Relative change of the batch-invariant run against the baseline.
        return (inv[metric] - base[metric]) / base[metric] * 100

    # All three deltas are computed before anything is printed.
    init_pct = pct_delta("init_time")
    time_pct = pct_delta("avg_time")
    tput_pct = pct_delta("throughput")

    print("\nInitialization Time:")
    print(f" Baseline: {base['init_time']:.2f}s")
    print(f" Batch Invariant: {inv['init_time']:.2f}s")
    print(f" Overhead: {init_pct:+.2f}%")

    print("\nAverage Trial Time:")
    print(f" Baseline: {base['avg_time']:.2f}s")
    print(f" Batch Invariant: {inv['avg_time']:.2f}s")
    print(f" Overhead: {time_pct:+.2f}%")

    print("\nThroughput (tokens/s):")
    print(f" Baseline: {base['throughput']:.2f}")
    print(f" Batch Invariant: {inv['throughput']:.2f}")
    print(f" Change: {tput_pct:+.2f}%")

    print("\nPrompts/s:")
    print(f" Baseline: {base['prompts_per_sec']:.2f}")
    print(f" Batch Invariant: {inv['prompts_per_sec']:.2f}")

    print("\n" + rule)
    print("SUMMARY")
    print(rule)

    if time_pct > 0:
        print(f"Batch invariance mode adds approximately {time_pct:.1f}% overhead")
    else:
        print(
            f"Batch invariance mode is approximately {-time_pct:.1f}% "
            "faster (unexpected!)"
        )

    if abs(tput_pct) < 1.0:
        print("Throughput difference is negligible (< 1%)")
    elif tput_pct < 0:
        print(f"Throughput decreased by {-tput_pct:.1f}% with batch invariance")
    else:
        print(
            f"Throughput increased by {tput_pct:.1f}% "
            "with batch invariance (unexpected!)"
        )

    print(rule + "\n")
    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit
    # is raised directly because the `exit` builtin is installed by the
    # `site` module for interactive use and is not guaranteed to exist
    # (e.g. under `python -S`).
    raise SystemExit(main())
...@@ -2,10 +2,10 @@ ...@@ -2,10 +2,10 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc import gc
from benchmark_utils import TimeCollector
from tabulate import tabulate from tabulate import tabulate
from benchmark_utils import TimeCollector from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.block_pool import BlockPool
......
...@@ -46,7 +46,7 @@ import time ...@@ -46,7 +46,7 @@ import time
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
def test_long_document_qa(llm=None, sampling_params=None, prompts=None): def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
......
...@@ -5,9 +5,9 @@ import time ...@@ -5,9 +5,9 @@ import time
from unittest import mock from unittest import mock
import numpy as np import numpy as np
from benchmark_utils import TimeCollector
from tabulate import tabulate from tabulate import tabulate
from benchmark_utils import TimeCollector
from vllm.config import ( from vllm.config import (
CacheConfig, CacheConfig,
DeviceConfig, DeviceConfig,
...@@ -19,7 +19,7 @@ from vllm.config import ( ...@@ -19,7 +19,7 @@ from vllm.config import (
VllmConfig, VllmConfig,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.gpu_model_runner import GPUModelRunner
...@@ -164,7 +164,7 @@ def invoke_main() -> None: ...@@ -164,7 +164,7 @@ def invoke_main() -> None:
) )
parser.add_argument( parser.add_argument(
"--batched", action="store_true", help="consider time to prepare batch" "--batched", action="store_true", help="consider time to prepare batch"
) # noqa: E501 )
parser.add_argument( parser.add_argument(
"--num-iteration", "--num-iteration",
type=int, type=int,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment