Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a810671a
Commit
a810671a
authored
Jan 08, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0rc0' into v0.14.0rc0-ori
parents
86b5aefe
6a09612b
Changes
291
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
208 additions
and
124 deletions
+208
-124
.buildkite/scripts/hardware_ci/run-amd-test.sh
.buildkite/scripts/hardware_ci/run-amd-test.sh
+0
-1
.buildkite/scripts/hardware_ci/run-cpu-test.sh
.buildkite/scripts/hardware_ci/run-cpu-test.sh
+1
-0
.buildkite/scripts/hardware_ci/run-xpu-test.sh
.buildkite/scripts/hardware_ci/run-xpu-test.sh
+1
-1
.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
...heduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
+1
-1
.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
...s/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+1
-1
.buildkite/test-amd.yaml
.buildkite/test-amd.yaml
+27
-20
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+28
-19
.buildkite/test_areas/distributed.yaml
.buildkite/test_areas/distributed.yaml
+1
-1
.buildkite/test_areas/e2e_integration.yaml
.buildkite/test_areas/e2e_integration.yaml
+1
-18
.buildkite/test_areas/entrypoints.yaml
.buildkite/test_areas/entrypoints.yaml
+19
-4
.buildkite/test_areas/lm_eval.yaml
.buildkite/test_areas/lm_eval.yaml
+2
-2
.buildkite/test_areas/lora.yaml
.buildkite/test_areas/lora.yaml
+2
-0
.buildkite/test_areas/models_basic.yaml
.buildkite/test_areas/models_basic.yaml
+2
-0
.buildkite/test_areas/pytorch.yaml
.buildkite/test_areas/pytorch.yaml
+3
-1
.buildkite/test_areas/tool_use.yaml
.buildkite/test_areas/tool_use.yaml
+0
-13
.github/mergify.yml
.github/mergify.yml
+26
-0
CMakeLists.txt
CMakeLists.txt
+78
-33
benchmarks/kernels/benchmark_activation.py
benchmarks/kernels/benchmark_activation.py
+2
-2
cmake/cpu_extension.cmake
cmake/cpu_extension.cmake
+2
-2
cmake/external_projects/flashmla.cmake
cmake/external_projects/flashmla.cmake
+11
-5
No files found.
.buildkite/scripts/hardware_ci/run-amd-test.sh
View file @
a810671a
...
...
@@ -141,7 +141,6 @@ if [[ $commands == *" entrypoints/openai "* ]]; then
--ignore=entrypoints/openai/test_audio.py
\
--ignore=entrypoints/openai/test_shutdown.py
\
--ignore=entrypoints/openai/test_completion.py
\
--ignore=entrypoints/openai/test_sleep.py
\
--ignore=entrypoints/openai/test_models.py
\
--ignore=entrypoints/openai/test_lora_adapters.py
\
--ignore=entrypoints/openai/test_return_tokens_as_ids.py
\
...
...
.buildkite/scripts/hardware_ci/run-cpu-test.sh
View file @
a810671a
...
...
@@ -50,6 +50,7 @@ function cpu_tests() {
docker
exec
cpu-test-
"
$NUMA_NODE
"
bash
-c
"
set -e
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test
...
...
.buildkite/scripts/hardware_ci/run-xpu-test.sh
View file @
a810671a
...
...
@@ -39,7 +39,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
--attention-backend=TRITON_ATTN
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine
...
...
.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
View file @
a810671a
...
...
@@ -44,10 +44,10 @@ trap cleanup EXIT
for
BACK
in
"
${
BACKENDS
[@]
}
"
;
do
VLLM_DEEP_GEMM_WARMUP
=
skip
\
VLLM_ALL2ALL_BACKEND
=
$BACK
\
vllm serve
"
$MODEL
"
\
--enforce-eager
\
--enable-eplb
\
--all2all-backend
$BACK
\
--eplb-config
'{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}'
\
--tensor-parallel-size
${
TENSOR_PARALLEL_SIZE
}
\
--data-parallel-size
${
DATA_PARALLEL_SIZE
}
\
...
...
.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
View file @
a810671a
...
...
@@ -43,12 +43,12 @@ trap cleanup EXIT
for
BACK
in
"
${
BACKENDS
[@]
}
"
;
do
VLLM_DEEP_GEMM_WARMUP
=
skip
\
VLLM_ALL2ALL_BACKEND
=
$BACK
\
vllm serve
"
$MODEL
"
\
--enforce-eager
\
--tensor-parallel-size
4
\
--enable-expert-parallel
\
--enable-eplb
\
--all2all-backend
$BACK
\
--eplb-config
'{"window_size":200,"step_interval":600,"use_async":true}'
\
--speculative-config
'{"method":"qwen3_next_mtp","num_speculative_tokens":1}'
\
--trust-remote-code
\
...
...
.buildkite/test-amd.yaml
View file @
a810671a
...
...
@@ -128,7 +128,7 @@ steps:
-
tests/entrypoints/
commands
:
-
pytest -v -s entrypoints/openai/tool_parsers
-
pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-
pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai
--ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator
--ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-
label
:
Entrypoints Integration Test (LLM)
# 30min
timeout_in_minutes
:
40
...
...
@@ -148,7 +148,7 @@ steps:
-
pytest -v -s entrypoints/llm/test_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/offline_mode
# Needs to avoid interference with other tests
-
label
:
Entrypoints Integration Test (API Server)
# 100min
-
label
:
Entrypoints Integration Test (API Server
1
)
# 100min
timeout_in_minutes
:
130
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_1
...
...
@@ -162,10 +162,28 @@ steps:
-
tests/entrypoints/test_chat_utils
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py
# PYTHONPATH is needed to import custom Worker extension
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
-
pytest -v -s entrypoints/test_chat_utils.py
-
label
:
Entrypoints Integration Test (API Server 2)
timeout_in_minutes
:
50
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_1
# grade: Blocking
working_dir
:
"
/vllm-workspace/tests"
fast_check
:
true
torch_nightly
:
true
source_file_dependencies
:
-
vllm/
-
tests/entrypoints/sleep
-
tests/entrypoints/rpc
-
tests/tool_use
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s entrypoints/sleep
-
pytest -v -s tool_use
-
PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
-
label
:
Entrypoints Integration Test (Pooling)
timeout_in_minutes
:
50
mirror_hardwares
:
[
amdexperimental
]
...
...
@@ -722,7 +740,7 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
-
uv pip install --system torchao==0.1
3.0
-
uv pip install --system torchao==0.1
4.1
-
uv pip install --system conch-triton-kernels
-
VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
...
...
@@ -736,7 +754,7 @@ steps:
-
vllm/model_executor/layers/quantization
autorun_on_main
:
true
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
--tp-size=1
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
-
label
:
OpenAI API correctness
# 10min
timeout_in_minutes
:
15
...
...
@@ -751,17 +769,6 @@ steps:
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
-
pytest -s entrypoints/openai/correctness/
-
label
:
OpenAI-Compatible Tool Use
# 23 min
timeout_in_minutes
:
35
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_1
# grade: Blocking
fast_check
:
false
source_file_dependencies
:
-
vllm/
-
tests/tool_use
commands
:
-
pytest -v -s tool_use
##### models test #####
...
...
@@ -1196,7 +1203,7 @@ steps:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
--tp-size=1
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
##### 1 GPU test #####
##### multi gpus test #####
...
...
@@ -1490,7 +1497,7 @@ steps:
-
"
VLLM_TEST_CLEAN_GPU_MEMORY=1
pytest
-v
-s
tests/compile/distributed/test_fusions_e2e.py
-k
'not
Llama-4'"
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
-
pytest -v -s tests/distributed/test_context_parallel.py
-
HIP_VISIBLE_DEVICES=0,1
VLLM_ALL2ALL_BACKEND=deepep_high_throughput
VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1
--dp-size=2 --max-model-len
2048
-
HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
--all2all-backend deepep_high_throughput
-
pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
...
...
@@ -1514,7 +1521,7 @@ steps:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
--tp-size=1
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
-
label
:
LM Eval Large Models (4 Card)
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
...
...
.buildkite/test-pipeline.yaml
View file @
a810671a
...
...
@@ -114,7 +114,7 @@ steps:
-
tests/entrypoints/
commands
:
-
pytest -v -s entrypoints/openai/tool_parsers
-
pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-
pytest -v -s entrypoints/ --ignore=entrypoints/llm
--ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator
--ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-
label
:
Entrypoints Integration Test (LLM)
# 30min
timeout_in_minutes
:
40
...
...
@@ -132,7 +132,7 @@ steps:
-
pytest -v -s entrypoints/llm/test_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/offline_mode
# Needs to avoid interference with other tests
-
label
:
Entrypoints Integration Test (API Server)
# 100min
-
label
:
Entrypoints Integration Test (API Server
1
)
# 100min
timeout_in_minutes
:
130
mirror_hardwares
:
[
amdexperimental
]
working_dir
:
"
/vllm-workspace/tests"
...
...
@@ -144,10 +144,26 @@ steps:
-
tests/entrypoints/test_chat_utils
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py
# PYTHONPATH is needed to import custom Worker extension
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
-
pytest -v -s entrypoints/test_chat_utils.py
-
label
:
Entrypoints Integration Test (API Server 2)
timeout_in_minutes
:
50
mirror_hardwares
:
[
amdexperimental
]
working_dir
:
"
/vllm-workspace/tests"
fast_check
:
true
torch_nightly
:
true
source_file_dependencies
:
-
vllm/
-
tests/entrypoints/sleep
-
tests/entrypoints/rpc
-
tests/tool_use
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s entrypoints/sleep
-
PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
-
pytest -v -s tool_use
-
label
:
Entrypoints Integration Test (Pooling)
timeout_in_minutes
:
50
mirror_hardwares
:
[
amdexperimental
]
...
...
@@ -303,7 +319,10 @@ steps:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-
pytest -v -s v1/e2e
-
pytest -v -s v1/engine
# Run this test standalone for now;
# need to untangle the (implicit) use of spawn/fork across the tests.
-
pytest -v -s v1/engine/test_preprocess_error_handling.py
-
pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
-
label
:
V1 Test entrypoints
# 35min
timeout_in_minutes
:
50
...
...
@@ -642,7 +661,7 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
-
uv pip install --system torchao==0.1
3.0
--index-url https://download.pytorch.org/whl/cu129
-
uv pip install --system torchao==0.1
4.1
--index-url https://download.pytorch.org/whl/cu129
-
uv pip install --system conch-triton-kernels
-
VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
...
...
@@ -654,7 +673,7 @@ steps:
-
vllm/model_executor/layers/quantization
autorun_on_main
:
true
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
--tp-size=1
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
-
label
:
OpenAI API correctness
# 22min
timeout_in_minutes
:
30
...
...
@@ -666,16 +685,6 @@ steps:
commands
:
# LMEval+Transcription WER check
-
pytest -s entrypoints/openai/correctness/
-
label
:
OpenAI-Compatible Tool Use
# 23 min
timeout_in_minutes
:
35
mirror_hardwares
:
[
amdexperimental
]
fast_check
:
false
source_file_dependencies
:
-
vllm/
-
tests/tool_use
commands
:
-
pytest -v -s tool_use
##### models test #####
-
label
:
Basic Models Tests (Initialization)
...
...
@@ -1064,7 +1073,7 @@ steps:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
--tp-size=1
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
##### 1 GPU test #####
##### multi gpus test #####
...
...
@@ -1325,7 +1334,7 @@ steps:
-
"
VLLM_TEST_CLEAN_GPU_MEMORY=1
pytest
-v
-s
tests/compile/distributed/test_fusions_e2e.py
-k
'not
Llama-4'"
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
-
pytest -v -s tests/distributed/test_context_parallel.py
-
CUDA_VISIBLE_DEVICES=1,2
VLLM_ALL2ALL_BACKEND=deepep_high_throughput
VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1
--dp-size=2 --max-model-len
2048
-
CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
--all2all-backend deepep_high_throughput
-
pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
...
...
.buildkite/test_areas/distributed.yaml
View file @
a810671a
...
...
@@ -145,7 +145,7 @@ steps:
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
-
pytest -v -s tests/distributed/test_context_parallel.py
-
CUDA_VISIBLE_DEVICES=1,2
VLLM_ALL2ALL_BACKEND=deepep_high_throughput
VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1
--dp-size=2 --max-model-len
2048
-
CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
--all2all-backend deepep_high_throughput
-
pytest -v -s tests/v1/distributed/test_dbo.py
-
label
:
Distributed Tests (2 GPUs)(B200)
...
...
.buildkite/test_areas/e2e_integration.yaml
View file @
a810671a
...
...
@@ -32,6 +32,7 @@ steps:
-
label
:
Prime-RL Integration (2 GPUs)
timeout_in_minutes
:
30
optional
:
true
soft_fail
:
true
num_gpus
:
2
working_dir
:
"
/vllm-workspace"
source_file_dependencies
:
...
...
@@ -39,21 +40,3 @@ steps:
-
.buildkite/scripts/run-prime-rl-test.sh
commands
:
-
bash .buildkite/scripts/run-prime-rl-test.sh
-
label
:
DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes
:
60
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319
8030
-
label
:
Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes
:
60
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319
8040
.buildkite/test_areas/entrypoints.yaml
View file @
a810671a
...
...
@@ -10,7 +10,7 @@ steps:
-
tests/entrypoints/
commands
:
-
pytest -v -s entrypoints/openai/tool_parsers
-
pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-
pytest -v -s entrypoints/ --ignore=entrypoints/llm
--ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator
--ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-
label
:
Entrypoints Integration (LLM)
timeout_in_minutes
:
40
...
...
@@ -25,7 +25,7 @@ steps:
-
pytest -v -s entrypoints/llm/test_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/offline_mode
# Needs to avoid interference with other tests
-
label
:
Entrypoints Integration (API Server)
-
label
:
Entrypoints Integration (API Server
1
)
timeout_in_minutes
:
130
working_dir
:
"
/vllm-workspace/tests"
source_file_dependencies
:
...
...
@@ -34,11 +34,26 @@ steps:
-
tests/entrypoints/test_chat_utils
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py
# PYTHONPATH is needed to import custom Worker extension
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
-
pytest -v -s entrypoints/test_chat_utils.py
-
label
:
Entrypoints Integration (API Server 2)
timeout_in_minutes
:
130
working_dir
:
"
/vllm-workspace/tests"
source_file_dependencies
:
-
vllm/
-
tests/tool_use
-
tests/entrypoints/sleep
-
tests/entrypoints/instrumentator
-
tests/entrypoints/rpc
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
-
pytest -v -s entrypoints/instrumentator
-
pytest -v -s entrypoints/sleep
-
pytest -v -s tool_use
-
label
:
Entrypoints Integration (Pooling)
timeout_in_minutes
:
50
working_dir
:
"
/vllm-workspace/tests"
...
...
.buildkite/test_areas/lm_eval.yaml
View file @
a810671a
...
...
@@ -9,7 +9,7 @@ steps:
-
vllm/model_executor/layers/quantization
autorun_on_main
:
true
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
--tp-size=1
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
-
label
:
LM Eval Large Models (4 GPUs)(A100)
gpu
:
a100
...
...
@@ -43,4 +43,4 @@ steps:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
--tp-size=1
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
.buildkite/test_areas/lora.yaml
View file @
a810671a
...
...
@@ -22,6 +22,8 @@ steps:
# FIXIT: find out which code initializes CUDA before running the test
# before the fix, we need to use spawn to test it
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
# A lot of these tests are on the edge of OOMing
-
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
-
pytest -v -s -x lora/test_chatglm3_tp.py
...
...
.buildkite/test_areas/models_basic.yaml
View file @
a810671a
...
...
@@ -9,6 +9,7 @@ steps:
source_file_dependencies
:
-
vllm/
-
tests/models/test_initialization.py
-
tests/models/registry.py
commands
:
# Run a subset of model initialization tests
-
pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
...
...
@@ -20,6 +21,7 @@ steps:
source_file_dependencies
:
-
vllm/model_executor/models/
-
tests/models/test_initialization.py
-
tests/models/registry.py
commands
:
# Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above
...
...
.buildkite/test_areas/pytorch.yaml
View file @
a810671a
...
...
@@ -13,7 +13,9 @@ steps:
# tests covered elsewhere.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-
"
find
compile/
-maxdepth
1
-name
'test_*.py'
-exec
pytest
-s
-v
{}
\\
;"
# However, find does not normally propagate error codes, so we combine it with xargs
# (using -0 for proper path handling)
-
"
find
compile/
-maxdepth
1
-name
'test_*.py'
-print0
|
xargs
-0
-n1
-I{}
pytest
-s
-v
'{}'"
-
label
:
PyTorch Fullgraph Smoke Test
timeout_in_minutes
:
30
...
...
.buildkite/test_areas/tool_use.yaml
deleted
100644 → 0
View file @
86b5aefe
group
:
Tool use
depends_on
:
-
image-build
steps
:
-
label
:
OpenAI-Compatible Tool Use
timeout_in_minutes
:
35
mirror_hardwares
:
[
amdexperimental
]
fast_check
:
false
source_file_dependencies
:
-
vllm/
-
tests/tool_use
commands
:
-
pytest -v -s tool_use
.github/mergify.yml
View file @
a810671a
...
...
@@ -235,6 +235,20 @@ pull_request_rules:
add
:
-
rocm
-
name
:
label-cpu
description
:
Automatically apply cpu label
conditions
:
-
label != stale
-
files~=^(?!.*kv_offload)(?!.*cpu_offload).*\bcpu.*
actions
:
label
:
add
:
-
cpu
assign
:
users
:
-
"
fadara01"
-
"
aditew01"
-
name
:
label-structured-output
description
:
Automatically apply structured-output label
conditions
:
...
...
@@ -335,6 +349,18 @@ pull_request_rules:
add
:
-
tool-calling
-
name
:
auto-rebase if approved, ready, and 40 commits behind main
conditions
:
-
base = main
-
label=ready
-
"
#approved-reviews-by
>=
1"
-
"
#commits-behind
>=
40"
-
-closed
-
-draft
-
-conflict
actions
:
rebase
:
{}
-
name
:
ping author on conflicts and add 'needs-rebase' label
conditions
:
-
label != stale
...
...
CMakeLists.txt
View file @
a810671a
...
...
@@ -56,8 +56,8 @@ endif()
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
set
(
TORCH_SUPPORTED_VERSION_CUDA
"2.9.
0
"
)
set
(
TORCH_SUPPORTED_VERSION_ROCM
"2.9.
0
"
)
set
(
TORCH_SUPPORTED_VERSION_CUDA
"2.9.
1
"
)
set
(
TORCH_SUPPORTED_VERSION_ROCM
"2.9.
1
"
)
#
# Try to find python package with an executable that exactly matches
...
...
@@ -358,6 +358,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# marlin arches for fp16 output
cuda_archs_loose_intersection
(
MARLIN_ARCHS
"8.0+PTX"
"
${
CUDA_ARCHS
}
"
)
# marlin has limited support for turing
cuda_archs_loose_intersection
(
MARLIN_SM75_ARCHS
"7.5"
"
${
CUDA_ARCHS
}
"
)
# marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
cuda_archs_loose_intersection
(
MARLIN_BF16_ARCHS
"8.0+PTX;9.0+PTX"
"
${
CUDA_ARCHS
}
"
)
# marlin arches for fp8 input
...
...
@@ -365,8 +367,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
cuda_archs_loose_intersection
(
MARLIN_FP8_ARCHS
"8.9;12.0"
"
${
CUDA_ARCHS
}
"
)
# marlin arches for other files
cuda_archs_loose_intersection
(
MARLIN_OTHER_ARCHS
"7.5;8.0+PTX"
"
${
CUDA_ARCHS
}
"
)
if
(
MARLIN_ARCHS
)
if
(
MARLIN_
OTHER_
ARCHS
)
#
# For the Marlin kernels we automatically generate sources for various
...
...
@@ -407,25 +411,39 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message
(
STATUS
"Marlin generation script has not changed, skipping generation."
)
endif
()
file
(
GLOB MARLIN_TEMPLATE_KERNEL_SRC
"csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu"
)
set_gencode_flags_for_srcs
(
SRCS
"
${
MARLIN_TEMPLATE_KERNEL_SRC
}
"
CUDA_ARCHS
"
${
MARLIN_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER_EQUAL 12.8
)
set_source_files_properties
(
${
MARLIN_TEMPLATE_KERNEL_SRC
}
PROPERTIES COMPILE_FLAGS
"-static-global-template-stub=false"
)
if
(
MARLIN_ARCHS
)
file
(
GLOB MARLIN_TEMPLATE_KERNEL_SRC
"csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu"
)
set_gencode_flags_for_srcs
(
SRCS
"
${
MARLIN_TEMPLATE_KERNEL_SRC
}
"
CUDA_ARCHS
"
${
MARLIN_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER_EQUAL 12.8
)
set_source_files_properties
(
${
MARLIN_TEMPLATE_KERNEL_SRC
}
PROPERTIES COMPILE_FLAGS
"-static-global-template-stub=false"
)
endif
()
list
(
APPEND VLLM_EXT_SRC
${
MARLIN_TEMPLATE_KERNEL_SRC
}
)
file
(
GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC
"csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu"
)
set_gencode_flags_for_srcs
(
SRCS
"
${
MARLIN_TEMPLATE_BF16_KERNEL_SRC
}
"
CUDA_ARCHS
"
${
MARLIN_BF16_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER_EQUAL 12.8
)
set_source_files_properties
(
${
MARLIN_TEMPLATE_BF16_KERNEL_SRC
}
PROPERTIES COMPILE_FLAGS
"-static-global-template-stub=false"
)
endif
()
list
(
APPEND VLLM_EXT_SRC
${
MARLIN_TEMPLATE_BF16_KERNEL_SRC
}
)
endif
()
list
(
APPEND VLLM_EXT_SRC
${
MARLIN_TEMPLATE_KERNEL_SRC
}
)
file
(
GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC
"csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu"
)
set_gencode_flags_for_srcs
(
SRCS
"
${
MARLIN_TEMPLATE_BF16_KERNEL_SRC
}
"
CUDA_ARCHS
"
${
MARLIN_BF16_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER_EQUAL 12.8
)
set_source_files_properties
(
${
MARLIN_TEMPLATE_BF16_KERNEL_SRC
}
PROPERTIES COMPILE_FLAGS
"-static-global-template-stub=false"
)
if
(
MARLIN_SM75_ARCHS
)
file
(
GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC
"csrc/quantization/gptq_marlin/sm75_kernel_*.cu"
)
set_gencode_flags_for_srcs
(
SRCS
"
${
MARLIN_TEMPLATE_SM75_KERNEL_SRC
}
"
CUDA_ARCHS
"
${
MARLIN_SM75_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER_EQUAL 12.8
)
set_source_files_properties
(
${
MARLIN_TEMPLATE_SM75_KERNEL_SRC
}
PROPERTIES COMPILE_FLAGS
"-static-global-template-stub=false"
)
endif
()
list
(
APPEND VLLM_EXT_SRC
${
MARLIN_TEMPLATE_SM75_KERNEL_SRC
}
)
endif
()
list
(
APPEND VLLM_EXT_SRC
${
MARLIN_TEMPLATE_BF16_KERNEL_SRC
}
)
if
(
MARLIN_FP8_ARCHS
)
file
(
GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC
"csrc/quantization/gptq_marlin/sm89_kernel_*.cu"
)
...
...
@@ -447,14 +465,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
)
set_gencode_flags_for_srcs
(
SRCS
"
${
MARLIN_SRCS
}
"
CUDA_ARCHS
"
${
MARLIN_ARCHS
}
"
)
CUDA_ARCHS
"
${
MARLIN_
OTHER_
ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER_EQUAL 12.8
)
set_source_files_properties
(
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
set_source_files_properties
(
${
MARLIN_SRCS
}
PROPERTIES COMPILE_FLAGS
"-static-global-template-stub=false"
)
endif
()
list
(
APPEND VLLM_EXT_SRC
"
${
MARLIN_SRCS
}
"
)
message
(
STATUS
"Building Marlin kernels for archs:
${
MARLIN_ARCHS
}
"
)
message
(
STATUS
"Building Marlin kernels for archs:
${
MARLIN_
OTHER_
ARCHS
}
"
)
else
()
message
(
STATUS
"Not building Marlin kernels as no compatible archs found"
" in CUDA target architectures"
)
...
...
@@ -981,12 +999,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# note that we always set `use_atomic_add=False` for moe marlin now,
# so we don't need 9.0 for bf16 atomicAdd PTX
cuda_archs_loose_intersection
(
MARLIN_MOE_ARCHS
"8.0+PTX"
"
${
CUDA_ARCHS
}
"
)
# moe marlin has limited support for turing
cuda_archs_loose_intersection
(
MARLIN_MOE_SM75_ARCHS
"7.5"
"
${
CUDA_ARCHS
}
"
)
# moe marlin arches for fp8 input
# - sm80 doesn't support fp8 computation
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
cuda_archs_loose_intersection
(
MARLIN_MOE_FP8_ARCHS
"8.9;12.0"
"
${
CUDA_ARCHS
}
"
)
if
(
MARLIN_MOE_ARCHS
)
# moe marlin arches for other files
cuda_archs_loose_intersection
(
MARLIN_MOE_OTHER_ARCHS
"7.5;8.0+PTX"
"
${
CUDA_ARCHS
}
"
)
if
(
MARLIN_MOE_OTHER_ARCHS
)
#
# For the Marlin MOE kernels we automatically generate sources for various
...
...
@@ -1027,16 +1049,29 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message
(
STATUS
"Marlin MOE generation script has not changed, skipping generation."
)
endif
()
file
(
GLOB MARLIN_MOE_SRC
"csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu"
)
list
(
APPEND MARLIN_MOE_SRC
"csrc/moe/marlin_moe_wna16/ops.cu"
)
set_gencode_flags_for_srcs
(
SRCS
"
${
MARLIN_MOE_SRC
}
"
CUDA_ARCHS
"
${
MARLIN_MOE_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER_EQUAL 12.8
)
set_source_files_properties
(
${
MARLIN_MOE_SRC
}
PROPERTIES COMPILE_FLAGS
"-static-global-template-stub=false"
)
if
(
MARLIN_MOE_ARCHS
)
file
(
GLOB MARLIN_MOE_SRC
"csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu"
)
set_gencode_flags_for_srcs
(
SRCS
"
${
MARLIN_MOE_SRC
}
"
CUDA_ARCHS
"
${
MARLIN_MOE_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER_EQUAL 12.8
)
set_source_files_properties
(
${
MARLIN_MOE_SRC
}
PROPERTIES COMPILE_FLAGS
"-static-global-template-stub=false"
)
endif
()
list
(
APPEND VLLM_MOE_EXT_SRC
${
MARLIN_MOE_SRC
}
)
endif
()
if
(
MARLIN_MOE_SM75_ARCHS
)
file
(
GLOB MARLIN_MOE_SM75_SRC
"csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu"
)
set_gencode_flags_for_srcs
(
SRCS
"
${
MARLIN_MOE_SM75_SRC
}
"
CUDA_ARCHS
"
${
MARLIN_MOE_SM75_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER_EQUAL 12.8
)
set_source_files_properties
(
${
MARLIN_MOE_SM75_SRC
}
PROPERTIES COMPILE_FLAGS
"-static-global-template-stub=false"
)
endif
()
list
(
APPEND VLLM_MOE_EXT_SRC
${
MARLIN_MOE_SM75_SRC
}
)
endif
()
list
(
APPEND VLLM_MOE_EXT_SRC
${
MARLIN_MOE_SRC
}
)
if
(
MARLIN_MOE_FP8_ARCHS
)
file
(
GLOB MARLIN_MOE_FP8_SRC
"csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu"
)
...
...
@@ -1050,7 +1085,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list
(
APPEND VLLM_MOE_EXT_SRC
${
MARLIN_MOE_FP8_SRC
}
)
endif
()
message
(
STATUS
"Building Marlin MOE kernels for archs:
${
MARLIN_MOE_ARCHS
}
"
)
set
(
MARLIN_MOE_OTHER_SRC
"csrc/moe/marlin_moe_wna16/ops.cu"
)
set_gencode_flags_for_srcs
(
SRCS
"
${
MARLIN_MOE_OTHER_SRC
}
"
CUDA_ARCHS
"
${
MARLIN_MOE_OTHER_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER_EQUAL 12.8
)
set_source_files_properties
(
${
MARLIN_MOE_OTHER_SRC
}
PROPERTIES COMPILE_FLAGS
"-static-global-template-stub=false"
)
endif
()
list
(
APPEND VLLM_MOE_EXT_SRC
"
${
MARLIN_MOE_OTHER_SRC
}
"
)
message
(
STATUS
"Building Marlin MOE kernels for archs:
${
MARLIN_MOE_OTHER_ARCHS
}
"
)
else
()
message
(
STATUS
"Not building Marlin MOE kernels as no compatible archs found"
" in CUDA target architectures"
)
...
...
benchmarks/kernels/benchmark_activation.py
View file @
a810671a
...
...
@@ -13,8 +13,8 @@ from vllm.triton_utils import triton
from
vllm.utils.argparse_utils
import
FlexibleArgumentParser
from
vllm.utils.torch_utils
import
STR_DTYPE_TO_TORCH_DTYPE
batch_size_range
=
[
1
,
16
,
32
,
64
,
128
]
seq_len_range
=
[
1
,
16
,
64
,
1
28
,
256
,
512
,
1024
,
2048
,
4096
]
batch_size_range
=
[
1
,
16
,
128
]
seq_len_range
=
[
1
,
16
,
64
,
1
024
,
4096
]
intermediate_size
=
[
3072
,
9728
,
12288
]
configs
=
list
(
itertools
.
product
(
batch_size_range
,
seq_len_range
,
intermediate_size
))
...
...
cmake/cpu_extension.cmake
View file @
a810671a
...
...
@@ -330,7 +330,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
PUBLIC
${
oneDNN_BINARY_DIR
}
/include
PRIVATE
${
oneDNN_SOURCE_DIR
}
/src
)
target_link_libraries
(
dnnl_ext dnnl
)
target_link_libraries
(
dnnl_ext dnnl
torch
)
target_compile_options
(
dnnl_ext PRIVATE
${
CXX_COMPILE_FLAGS
}
-fPIC
)
list
(
APPEND LIBS dnnl_ext
)
set
(
USE_ONEDNN ON
)
...
...
@@ -358,13 +358,13 @@ set(VLLM_EXT_SRC
"csrc/cpu/pos_encoding.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
"csrc/cpu/cpu_attn.cpp"
"csrc/cpu/scratchpad_manager.cpp"
"csrc/cpu/torch_bindings.cpp"
)
if
(
AVX512_FOUND AND NOT AVX512_DISABLED
)
set
(
VLLM_EXT_SRC
"csrc/cpu/shm.cpp"
"csrc/cpu/cpu_wna16.cpp"
"csrc/cpu/cpu_fused_moe.cpp"
${
VLLM_EXT_SRC
}
)
if
(
ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI
)
set
(
VLLM_EXT_SRC
...
...
cmake/external_projects/flashmla.cmake
View file @
a810671a
...
...
@@ -35,16 +35,21 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# sm90a
set
(
SUPPORT_ARCHS
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.3
)
list
(
APPEND SUPPORT_ARCHS 9.0a
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER
_EQUAL
12.3
)
list
(
APPEND SUPPORT_ARCHS
"
9.0a
"
)
endif
()
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.8
)
list
(
APPEND SUPPORT_ARCHS 10.0a
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER_EQUAL 12.9
)
# CUDA 12.9 has introduced "Family-Specific Architecture Features"
# this supports all compute_10x family
list
(
APPEND SUPPORT_ARCHS
"10.0f"
)
elseif
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER_EQUAL 12.8
)
list
(
APPEND SUPPORT_ARCHS
"10.0a"
)
endif
()
cuda_archs_loose_intersection
(
FLASH_MLA_ARCHS
"
${
SUPPORT_ARCHS
}
"
"
${
CUDA_ARCHS
}
"
)
if
(
FLASH_MLA_ARCHS
)
message
(
STATUS
"FlashMLA CUDA architectures:
${
FLASH_MLA_ARCHS
}
"
)
set
(
VLLM_FLASHMLA_GPU_FLAGS
${
VLLM_GPU_FLAGS
}
)
list
(
APPEND VLLM_FLASHMLA_GPU_FLAGS
"--expt-relaxed-constexpr"
"--expt-extended-lambda"
"--use_fast_math"
)
...
...
@@ -126,7 +131,8 @@ if(FLASH_MLA_ARCHS)
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>
)
else
()
# Create empty targets for setup.py when not targeting sm90a systems
message
(
STATUS
"FlashMLA will not compile: unsupported CUDA architecture
${
CUDA_ARCHS
}
"
)
# Create empty targets for setup.py on unsupported systems
add_custom_target
(
_flashmla_C
)
add_custom_target
(
_flashmla_extension_C
)
endif
()
...
...
Prev
1
2
3
4
5
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment