Commit a810671a authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0rc0' into v0.14.0rc0-ori

parents 86b5aefe 6a09612b
...@@ -141,7 +141,6 @@ if [[ $commands == *" entrypoints/openai "* ]]; then ...@@ -141,7 +141,6 @@ if [[ $commands == *" entrypoints/openai "* ]]; then
--ignore=entrypoints/openai/test_audio.py \ --ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \ --ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \ --ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_sleep.py \
--ignore=entrypoints/openai/test_models.py \ --ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \ --ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \ --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
......
...@@ -50,6 +50,7 @@ function cpu_tests() { ...@@ -50,6 +50,7 @@ function cpu_tests() {
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e set -e
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py" pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test # Run basic model test
......
...@@ -39,7 +39,7 @@ docker run \ ...@@ -39,7 +39,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
cd tests cd tests
pytest -v -s v1/core pytest -v -s v1/core
pytest -v -s v1/engine pytest -v -s v1/engine
......
...@@ -44,10 +44,10 @@ trap cleanup EXIT ...@@ -44,10 +44,10 @@ trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \ VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \ vllm serve "$MODEL" \
--enforce-eager \ --enforce-eager \
--enable-eplb \ --enable-eplb \
--all2all-backend $BACK \
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \ --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
--data-parallel-size ${DATA_PARALLEL_SIZE} \ --data-parallel-size ${DATA_PARALLEL_SIZE} \
......
...@@ -43,12 +43,12 @@ trap cleanup EXIT ...@@ -43,12 +43,12 @@ trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \ VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \ vllm serve "$MODEL" \
--enforce-eager \ --enforce-eager \
--tensor-parallel-size 4 \ --tensor-parallel-size 4 \
--enable-expert-parallel \ --enable-expert-parallel \
--enable-eplb \ --enable-eplb \
--all2all-backend $BACK \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \ --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \ --trust-remote-code \
......
...@@ -128,7 +128,7 @@ steps: ...@@ -128,7 +128,7 @@ steps:
- tests/entrypoints/ - tests/entrypoints/
commands: commands:
- pytest -v -s entrypoints/openai/tool_parsers - pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration Test (LLM) # 30min - label: Entrypoints Integration Test (LLM) # 30min
timeout_in_minutes: 40 timeout_in_minutes: 40
...@@ -148,7 +148,7 @@ steps: ...@@ -148,7 +148,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration Test (API Server) # 100min - label: Entrypoints Integration Test (API Server 1) # 100min
timeout_in_minutes: 130 timeout_in_minutes: 130
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
...@@ -162,10 +162,28 @@ steps: ...@@ -162,10 +162,28 @@ steps:
- tests/entrypoints/test_chat_utils - tests/entrypoints/test_chat_utils
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration Test (API Server 2)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/tests"
fast_check: true
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/entrypoints/sleep
- tests/entrypoints/rpc
- tests/tool_use
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/sleep
- pytest -v -s tool_use
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- label: Entrypoints Integration Test (Pooling) - label: Entrypoints Integration Test (Pooling)
timeout_in_minutes: 50 timeout_in_minutes: 50
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -722,7 +740,7 @@ steps: ...@@ -722,7 +740,7 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment # TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 - uv pip install --system torchao==0.14.1
- uv pip install --system conch-triton-kernels - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
...@@ -736,7 +754,7 @@ steps: ...@@ -736,7 +754,7 @@ steps:
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
autorun_on_main: true autorun_on_main: true
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: OpenAI API correctness # 10min - label: OpenAI API correctness # 10min
timeout_in_minutes: 15 timeout_in_minutes: 15
...@@ -751,17 +769,6 @@ steps: ...@@ -751,17 +769,6 @@ steps:
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
- pytest -s entrypoints/openai/correctness/ - pytest -s entrypoints/openai/correctness/
- label: OpenAI-Compatible Tool Use # 23 min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
fast_check: false
source_file_dependencies:
- vllm/
- tests/tool_use
commands:
- pytest -v -s tool_use
##### models test ##### ##### models test #####
...@@ -1196,7 +1203,7 @@ steps: ...@@ -1196,7 +1203,7 @@ steps:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
##### 1 GPU test ##### ##### 1 GPU test #####
##### multi gpus test ##### ##### multi gpus test #####
...@@ -1490,7 +1497,7 @@ steps: ...@@ -1490,7 +1497,7 @@ steps:
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test ##### ##### B200 test #####
...@@ -1514,7 +1521,7 @@ steps: ...@@ -1514,7 +1521,7 @@ steps:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: LM Eval Large Models (4 Card) - label: LM Eval Large Models (4 Card)
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
......
...@@ -114,7 +114,7 @@ steps: ...@@ -114,7 +114,7 @@ steps:
- tests/entrypoints/ - tests/entrypoints/
commands: commands:
- pytest -v -s entrypoints/openai/tool_parsers - pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration Test (LLM) # 30min - label: Entrypoints Integration Test (LLM) # 30min
timeout_in_minutes: 40 timeout_in_minutes: 40
...@@ -132,7 +132,7 @@ steps: ...@@ -132,7 +132,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration Test (API Server) # 100min - label: Entrypoints Integration Test (API Server 1) # 100min
timeout_in_minutes: 130 timeout_in_minutes: 130
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
...@@ -144,10 +144,26 @@ steps: ...@@ -144,10 +144,26 @@ steps:
- tests/entrypoints/test_chat_utils - tests/entrypoints/test_chat_utils
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration Test (API Server 2)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/entrypoints/sleep
- tests/entrypoints/rpc
- tests/tool_use
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/sleep
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s tool_use
- label: Entrypoints Integration Test (Pooling) - label: Entrypoints Integration Test (Pooling)
timeout_in_minutes: 50 timeout_in_minutes: 50
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -303,7 +319,10 @@ steps: ...@@ -303,7 +319,10 @@ steps:
# TODO: accuracy does not match, whether setting # TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100. # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e - pytest -v -s v1/e2e
- pytest -v -s v1/engine # Run this test standalone for now;
# need to untangle use (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
- label: V1 Test entrypoints # 35min - label: V1 Test entrypoints # 35min
timeout_in_minutes: 50 timeout_in_minutes: 50
...@@ -642,7 +661,7 @@ steps: ...@@ -642,7 +661,7 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment # TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system conch-triton-kernels - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
...@@ -654,7 +673,7 @@ steps: ...@@ -654,7 +673,7 @@ steps:
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
autorun_on_main: true autorun_on_main: true
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: OpenAI API correctness # 22min - label: OpenAI API correctness # 22min
timeout_in_minutes: 30 timeout_in_minutes: 30
...@@ -666,16 +685,6 @@ steps: ...@@ -666,16 +685,6 @@ steps:
commands: # LMEval+Transcription WER check commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/ - pytest -s entrypoints/openai/correctness/
- label: OpenAI-Compatible Tool Use # 23 min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: false
source_file_dependencies:
- vllm/
- tests/tool_use
commands:
- pytest -v -s tool_use
##### models test ##### ##### models test #####
- label: Basic Models Tests (Initialization) - label: Basic Models Tests (Initialization)
...@@ -1064,7 +1073,7 @@ steps: ...@@ -1064,7 +1073,7 @@ steps:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
##### 1 GPU test ##### ##### 1 GPU test #####
##### multi gpus test ##### ##### multi gpus test #####
...@@ -1325,7 +1334,7 @@ steps: ...@@ -1325,7 +1334,7 @@ steps:
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test ##### ##### B200 test #####
......
...@@ -145,7 +145,7 @@ steps: ...@@ -145,7 +145,7 @@ steps:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
- label: Distributed Tests (2 GPUs)(B200) - label: Distributed Tests (2 GPUs)(B200)
......
...@@ -32,6 +32,7 @@ steps: ...@@ -32,6 +32,7 @@ steps:
- label: Prime-RL Integration (2 GPUs) - label: Prime-RL Integration (2 GPUs)
timeout_in_minutes: 30 timeout_in_minutes: 30
optional: true optional: true
soft_fail: true
num_gpus: 2 num_gpus: 2
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
source_file_dependencies: source_file_dependencies:
...@@ -39,21 +40,3 @@ steps: ...@@ -39,21 +40,3 @@ steps:
- .buildkite/scripts/run-prime-rl-test.sh - .buildkite/scripts/run-prime-rl-test.sh
commands: commands:
- bash .buildkite/scripts/run-prime-rl-test.sh - bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
...@@ -10,7 +10,7 @@ steps: ...@@ -10,7 +10,7 @@ steps:
- tests/entrypoints/ - tests/entrypoints/
commands: commands:
- pytest -v -s entrypoints/openai/tool_parsers - pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration (LLM) - label: Entrypoints Integration (LLM)
timeout_in_minutes: 40 timeout_in_minutes: 40
...@@ -25,7 +25,7 @@ steps: ...@@ -25,7 +25,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration (API Server) - label: Entrypoints Integration (API Server 1)
timeout_in_minutes: 130 timeout_in_minutes: 130
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
source_file_dependencies: source_file_dependencies:
...@@ -34,11 +34,26 @@ steps: ...@@ -34,11 +34,26 @@ steps:
- tests/entrypoints/test_chat_utils - tests/entrypoints/test_chat_utils
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration (API Server 2)
timeout_in_minutes: 130
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/tool_use
- tests/entrypoints/sleep
- tests/entrypoints/instrumentator
- tests/entrypoints/rpc
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s entrypoints/instrumentator
- pytest -v -s entrypoints/sleep
- pytest -v -s tool_use
- label: Entrypoints Integration (Pooling) - label: Entrypoints Integration (Pooling)
timeout_in_minutes: 50 timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
......
...@@ -9,7 +9,7 @@ steps: ...@@ -9,7 +9,7 @@ steps:
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
autorun_on_main: true autorun_on_main: true
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: LM Eval Large Models (4 GPUs)(A100) - label: LM Eval Large Models (4 GPUs)(A100)
gpu: a100 gpu: a100
...@@ -43,4 +43,4 @@ steps: ...@@ -43,4 +43,4 @@ steps:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
...@@ -22,6 +22,8 @@ steps: ...@@ -22,6 +22,8 @@ steps:
# FIXIT: find out which code initialize cuda before running the test # FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it # before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# Alot of these tests are on the edge of OOMing
- export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# There is some Tensor Parallelism related processing logic in LoRA that # There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation. # requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_chatglm3_tp.py
......
...@@ -9,6 +9,7 @@ steps: ...@@ -9,6 +9,7 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/test_initialization.py - tests/models/test_initialization.py
- tests/models/registry.py
commands: commands:
# Run a subset of model initialization tests # Run a subset of model initialization tests
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
...@@ -20,6 +21,7 @@ steps: ...@@ -20,6 +21,7 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/models/ - vllm/model_executor/models/
- tests/models/test_initialization.py - tests/models/test_initialization.py
- tests/models/registry.py
commands: commands:
# Only when vLLM model source is modified - test initialization of a large # Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above # subset of supported models (the complement of the small subset in the above
......
...@@ -13,7 +13,9 @@ steps: ...@@ -13,7 +13,9 @@ steps:
# tests covered elsewhere. # tests covered elsewhere.
# Use `find` to launch multiple instances of pytest so that # Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965 # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;" # However, find does not normally propagate error codes, so we combine it with xargs
# (using -0 for proper path handling)
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Smoke Test - label: PyTorch Fullgraph Smoke Test
timeout_in_minutes: 30 timeout_in_minutes: 30
......
group: Tool use
depends_on:
- image-build
steps:
- label: OpenAI-Compatible Tool Use
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: false
source_file_dependencies:
- vllm/
- tests/tool_use
commands:
- pytest -v -s tool_use
...@@ -235,6 +235,20 @@ pull_request_rules: ...@@ -235,6 +235,20 @@ pull_request_rules:
add: add:
- rocm - rocm
- name: label-cpu
description: Automatically apply cpu label
conditions:
- label != stale
- files~=^(?!.*kv_offload)(?!.*cpu_offload).*\bcpu.*
actions:
label:
add:
- cpu
assign:
users:
- "fadara01"
- "aditew01"
- name: label-structured-output - name: label-structured-output
description: Automatically apply structured-output label description: Automatically apply structured-output label
conditions: conditions:
...@@ -335,6 +349,18 @@ pull_request_rules: ...@@ -335,6 +349,18 @@ pull_request_rules:
add: add:
- tool-calling - tool-calling
- name: auto-rebase if approved, ready, and 40 commits behind main
conditions:
- base = main
- label=ready
- "#approved-reviews-by >= 1"
- "#commits-behind >= 40"
- -closed
- -draft
- -conflict
actions:
rebase: {}
- name: ping author on conflicts and add 'needs-rebase' label - name: ping author on conflicts and add 'needs-rebase' label
conditions: conditions:
- label != stale - label != stale
......
...@@ -56,8 +56,8 @@ endif() ...@@ -56,8 +56,8 @@ endif()
# requirements.txt files and should be kept consistent. The ROCm torch # requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm # versions are derived from docker/Dockerfile.rocm
# #
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0") set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0") set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
# #
# Try to find python package with an executable that exactly matches # Try to find python package with an executable that exactly matches
...@@ -358,6 +358,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -358,6 +358,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# marlin arches for fp16 output # marlin arches for fp16 output
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}") cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
# marlin has limited support for turing
cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
# marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX) # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
# marlin arches for fp8 input # marlin arches for fp8 input
...@@ -365,8 +367,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -365,8 +367,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0) # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}") cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
# marlin arches for other files
cuda_archs_loose_intersection(MARLIN_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_ARCHS) if (MARLIN_OTHER_ARCHS)
# #
# For the Marlin kernels we automatically generate sources for various # For the Marlin kernels we automatically generate sources for various
...@@ -407,6 +411,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -407,6 +411,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Marlin generation script has not changed, skipping generation.") message(STATUS "Marlin generation script has not changed, skipping generation.")
endif() endif()
if (MARLIN_ARCHS)
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu") file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}" SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
...@@ -426,6 +431,19 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -426,6 +431,19 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif() endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC}) list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
endif()
if (MARLIN_SM75_ARCHS)
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_SM75_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MARLIN_TEMPLATE_SM75_KERNEL_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
endif()
if (MARLIN_FP8_ARCHS) if (MARLIN_FP8_ARCHS)
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu") file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
...@@ -447,14 +465,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -447,14 +465,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu") "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_SRCS}" SRCS "${MARLIN_SRCS}"
CUDA_ARCHS "${MARLIN_ARCHS}") CUDA_ARCHS "${MARLIN_OTHER_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu" set_source_files_properties(${MARLIN_SRCS}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif() endif()
list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}") list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") message(STATUS "Building Marlin kernels for archs: ${MARLIN_OTHER_ARCHS}")
else() else()
message(STATUS "Not building Marlin kernels as no compatible archs found" message(STATUS "Not building Marlin kernels as no compatible archs found"
" in CUDA target architectures") " in CUDA target architectures")
...@@ -981,12 +999,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -981,12 +999,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# note that we always set `use_atomic_add=False` for moe marlin now, # note that we always set `use_atomic_add=False` for moe marlin now,
# so we don't need 9.0 for bf16 atomicAdd PTX # so we don't need 9.0 for bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}") cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
# moe marlin has limited support for turing
cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
# moe marlin arches for fp8 input # moe marlin arches for fp8 input
# - sm80 doesn't support fp8 computation # - sm80 doesn't support fp8 computation
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0) # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}") cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS) # moe marlin arches for other files
cuda_archs_loose_intersection(MARLIN_MOE_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_MOE_OTHER_ARCHS)
# #
# For the Marlin MOE kernels we automatically generate sources for various # For the Marlin MOE kernels we automatically generate sources for various
...@@ -1027,8 +1049,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -1027,8 +1049,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Marlin MOE generation script has not changed, skipping generation.") message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
endif() endif()
if (MARLIN_MOE_ARCHS)
file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu") file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_MOE_SRC}" SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}") CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
...@@ -1037,6 +1059,19 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -1037,6 +1059,19 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif() endif()
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC}) list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
endif()
if (MARLIN_MOE_SM75_ARCHS)
file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_MOE_SM75_SRC}"
CUDA_ARCHS "${MARLIN_MOE_SM75_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MARLIN_MOE_SM75_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SM75_SRC})
endif()
if (MARLIN_MOE_FP8_ARCHS) if (MARLIN_MOE_FP8_ARCHS)
file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu") file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
...@@ -1050,7 +1085,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -1050,7 +1085,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC}) list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
endif() endif()
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") set(MARLIN_MOE_OTHER_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_MOE_OTHER_SRC}"
CUDA_ARCHS "${MARLIN_MOE_OTHER_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MARLIN_MOE_OTHER_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_OTHER_SRC}")
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_OTHER_ARCHS}")
else() else()
message(STATUS "Not building Marlin MOE kernels as no compatible archs found" message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
" in CUDA target architectures") " in CUDA target architectures")
......
...@@ -13,8 +13,8 @@ from vllm.triton_utils import triton ...@@ -13,8 +13,8 @@ from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
batch_size_range = [1, 16, 32, 64, 128] batch_size_range = [1, 16, 128]
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096] seq_len_range = [1, 16, 64, 1024, 4096]
intermediate_size = [3072, 9728, 12288] intermediate_size = [3072, 9728, 12288]
configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size)) configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
......
...@@ -330,7 +330,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON ...@@ -330,7 +330,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
PUBLIC ${oneDNN_BINARY_DIR}/include PUBLIC ${oneDNN_BINARY_DIR}/include
PRIVATE ${oneDNN_SOURCE_DIR}/src PRIVATE ${oneDNN_SOURCE_DIR}/src
) )
target_link_libraries(dnnl_ext dnnl) target_link_libraries(dnnl_ext dnnl torch)
target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC) target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
list(APPEND LIBS dnnl_ext) list(APPEND LIBS dnnl_ext)
set(USE_ONEDNN ON) set(USE_ONEDNN ON)
...@@ -358,13 +358,13 @@ set(VLLM_EXT_SRC ...@@ -358,13 +358,13 @@ set(VLLM_EXT_SRC
"csrc/cpu/pos_encoding.cpp" "csrc/cpu/pos_encoding.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp" "csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
"csrc/cpu/cpu_attn.cpp" "csrc/cpu/cpu_attn.cpp"
"csrc/cpu/scratchpad_manager.cpp"
"csrc/cpu/torch_bindings.cpp") "csrc/cpu/torch_bindings.cpp")
if (AVX512_FOUND AND NOT AVX512_DISABLED) if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC set(VLLM_EXT_SRC
"csrc/cpu/shm.cpp" "csrc/cpu/shm.cpp"
"csrc/cpu/cpu_wna16.cpp" "csrc/cpu/cpu_wna16.cpp"
"csrc/cpu/cpu_fused_moe.cpp"
${VLLM_EXT_SRC}) ${VLLM_EXT_SRC})
if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
set(VLLM_EXT_SRC set(VLLM_EXT_SRC
......
...@@ -35,16 +35,21 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}") ...@@ -35,16 +35,21 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# sm90a # sm90a
set(SUPPORT_ARCHS) set(SUPPORT_ARCHS)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
list(APPEND SUPPORT_ARCHS 9.0a) list(APPEND SUPPORT_ARCHS "9.0a")
endif() endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
list(APPEND SUPPORT_ARCHS 10.0a) # CUDA 12.9 has introduced "Family-Specific Architecture Features"
# this supports all compute_10x family
list(APPEND SUPPORT_ARCHS "10.0f")
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
list(APPEND SUPPORT_ARCHS "10.0a")
endif() endif()
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}") cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
if(FLASH_MLA_ARCHS) if(FLASH_MLA_ARCHS)
message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS}) set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math") list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
...@@ -126,7 +131,8 @@ if(FLASH_MLA_ARCHS) ...@@ -126,7 +131,8 @@ if(FLASH_MLA_ARCHS)
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API> $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>) $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
else() else()
# Create empty targets for setup.py when not targeting sm90a systems message(STATUS "FlashMLA will not compile: unsupported CUDA architecture ${CUDA_ARCHS}")
# Create empty targets for setup.py on unsupported systems
add_custom_target(_flashmla_C) add_custom_target(_flashmla_C)
add_custom_target(_flashmla_extension_C) add_custom_target(_flashmla_extension_C)
endif() endif()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment