Commit 6d2051cc authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev

parents 2c7f740a a2c71c54
...@@ -18,7 +18,13 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg ...@@ -18,7 +18,13 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
# Run basic model test # Run basic model test
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported pytest -v -s tests/models -m \"not vlm\" \
--ignore=tests/models/test_embedding.py \
--ignore=tests/models/test_oot_registration.py \
--ignore=tests/models/test_registry.py \
--ignore=tests/models/test_jamba.py \
--ignore=tests/models/test_mamba.py \
--ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
# online inference # online inference
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
......
...@@ -23,16 +23,24 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" ...@@ -23,16 +23,24 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test # Run basic model test
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language \ pytest -v -s tests/models/decoder_only/language \
--ignore=tests/models/test_fp8.py \ --ignore=tests/models/test_fp8.py \
--ignore=tests/models/decoder_only/language/test_jamba.py \ --ignore=tests/models/decoder_only/language/test_jamba.py \
--ignore=tests/models/decoder_only/language/test_mamba.py \
--ignore=tests/models/decoder_only/language/test_granitemoe.py \
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
# Run compressed-tensor test # Run compressed-tensor test
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
pytest -s -v \ pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token" tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker exec cpu-test bash -c "
pytest -s -v \
tests/quantization/test_ipex_quant.py"
# online inference # online inference
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
......
...@@ -11,4 +11,4 @@ trap remove_docker_container EXIT ...@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Run the image and launch offline inference # Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
# label(str): the name of the test. emoji allowed. # label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline. # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only # fast_check_only(bool): run this test on fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually)
# command(str): the single command to run for tests. incompatible with commands. # command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatbile with command. # commands(list): the list of commands to run for test. incompatbile with command.
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd] # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
...@@ -39,7 +40,7 @@ steps: ...@@ -39,7 +40,7 @@ steps:
# Check API reference (if it fails, you may have missing mock imports) # Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/dev/sampling_params.html - grep \"sig sig-object py\" build/html/dev/sampling_params.html
- label: Async Engine, Inputs, Utils, Worker Test # 15min - label: Async Engine, Inputs, Utils, Worker Test # 24min
fast_check: true fast_check: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
...@@ -63,13 +64,21 @@ steps: ...@@ -63,13 +64,21 @@ steps:
fast_check: true fast_check: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/basic_correctness - tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- tests/basic_correctness/test_preemption
commands: commands:
- pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py - pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Chunked Prefill Test
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_chunked_prefill
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test # 10min - label: Core Test # 10min
mirror_hardwares: [amd] mirror_hardwares: [amd]
...@@ -81,7 +90,7 @@ steps: ...@@ -81,7 +90,7 @@ steps:
commands: commands:
- pytest -v -s core - pytest -v -s core
- label: Entrypoints Test # 20min - label: Entrypoints Test # 40min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
fast_check: true fast_check: true
mirror_hardwares: [amd] mirror_hardwares: [amd]
...@@ -89,13 +98,13 @@ steps: ...@@ -89,13 +98,13 @@ steps:
- vllm/ - vllm/
commands: commands:
- pip install -e ./plugins/vllm_add_dummy_model - pip install -e ./plugins/vllm_add_dummy_model
- pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
...@@ -108,7 +117,9 @@ steps: ...@@ -108,7 +117,9 @@ steps:
- vllm/core/ - vllm/core/
- tests/distributed - tests/distributed
- tests/spec_decode/e2e/test_integration_dist_tp4 - tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile
commands: commands:
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
...@@ -136,7 +147,9 @@ steps: ...@@ -136,7 +147,9 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/test_regression - tests/test_regression
command: pytest -v -s test_regression.py commands:
- pip install modelscope
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 10min - label: Engine Test # 10min
...@@ -150,7 +163,7 @@ steps: ...@@ -150,7 +163,7 @@ steps:
# OOM in the CI unless we run this separately # OOM in the CI unless we run this separately
- pytest -v -s tokenization - pytest -v -s tokenization
- label: Examples Test # 12min - label: Examples Test # 15min
working_dir: "/vllm-workspace/examples" working_dir: "/vllm-workspace/examples"
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
...@@ -167,8 +180,9 @@ steps: ...@@ -167,8 +180,9 @@ steps:
- python3 offline_inference_vision_language_multi_image.py - python3 offline_inference_vision_language_multi_image.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py - python3 offline_inference_encoder_decoder.py
- python3 offline_profile.py --model facebook/opt-125m
- label: Prefix Caching Test # 7min - label: Prefix Caching Test # 9min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
...@@ -176,7 +190,7 @@ steps: ...@@ -176,7 +190,7 @@ steps:
commands: commands:
- pytest -v -s prefix_caching - pytest -v -s prefix_caching
- label: Samplers Test # 18min - label: Samplers Test # 36min
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/layers - vllm/model_executor/layers
- vllm/sampling_metadata.py - vllm/sampling_metadata.py
...@@ -192,17 +206,15 @@ steps: ...@@ -192,17 +206,15 @@ steps:
- tests/test_logits_processor - tests/test_logits_processor
command: pytest -v -s test_logits_processor.py command: pytest -v -s test_logits_processor.py
- label: Speculative decoding tests # 22min - label: Speculative decoding tests # 30min
source_file_dependencies: source_file_dependencies:
- vllm/spec_decode - vllm/spec_decode
- tests/spec_decode - tests/spec_decode
commands: commands:
# See https://github.com/vllm-project/vllm/issues/5152
- export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
- label: LoRA Test %N # 30min each - label: LoRA Test %N # 15min each
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
...@@ -210,22 +222,24 @@ steps: ...@@ -210,22 +222,24 @@ steps:
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4 parallelism: 4
- label: "PyTorch Fullgraph Smoke Test" - label: "PyTorch Fullgraph Smoke Test" # 9min
fast_check: true fast_check: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/compile - tests/compile
commands: commands:
- pytest -v -s compile/test_full_graph_smoke.py - pytest -v -s compile/test_basic_correctness.py
- label: "PyTorch Fullgraph Test" # TODO: re-write in comparison tests, and fix symbolic shape
source_file_dependencies: # for quantization ops.
- vllm/ # - label: "PyTorch Fullgraph Test" # 18min
- tests/compile # source_file_dependencies:
commands: # - vllm/
- pytest -v -s compile/test_full_graph.py # - tests/compile
# commands:
# - pytest -v -s compile/test_full_graph.py
- label: Kernels Test %N # 30min each - label: Kernels Test %N # 1h each
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
...@@ -255,12 +269,12 @@ steps: ...@@ -255,12 +269,12 @@ steps:
- pip install aiohttp - pip install aiohttp
- bash run-benchmarks.sh - bash run-benchmarks.sh
- label: Quantization Test # 15min - label: Quantization Test # 33min
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
- tests/quantization - tests/quantization
command: pytest -v -s quantization command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 53min
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
...@@ -268,7 +282,6 @@ steps: ...@@ -268,7 +282,6 @@ steps:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1 - bash ./run-tests.sh -c configs/models-small.txt -t 1
...@@ -299,7 +312,7 @@ steps: ...@@ -299,7 +312,7 @@ steps:
- pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py - pytest -v -s models/*.py --ignore=models/test_oot_registration.py
- label: Decoder-only Language Models Test # 1h3min - label: Decoder-only Language Models Test # 1h36min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
...@@ -307,7 +320,7 @@ steps: ...@@ -307,7 +320,7 @@ steps:
commands: commands:
- pytest -v -s models/decoder_only/language - pytest -v -s models/decoder_only/language
- label: Decoder-only Multi-Modal Models Test # 56min - label: Decoder-only Multi-Modal Models Test # 1h31min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
...@@ -317,15 +330,28 @@ steps: ...@@ -317,15 +330,28 @@ steps:
- pytest -v -s models/decoder_only/audio_language - pytest -v -s models/decoder_only/audio_language
- pytest -v -s models/decoder_only/vision_language - pytest -v -s models/decoder_only/vision_language
- label: Other Models Test # 5min - label: Other Models Test # 6min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/embedding/language - tests/models/embedding/language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/language - tests/models/encoder_decoder/language
- tests/models/encoder_decoder/vision_language
commands: commands:
- pytest -v -s models/embedding/language - pytest -v -s models/embedding/language
- pytest -v -s models/embedding/vision_language
- pytest -v -s models/encoder_decoder/language - pytest -v -s models/encoder_decoder/language
- pytest -v -s models/encoder_decoder/vision_language
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
optional: true
commands:
- echo 'Testing custom models...'
# PR authors can temporarily add commands below to test individual models
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
##### 1 GPU test ##### ##### 1 GPU test #####
##### multi gpus test ##### ##### multi gpus test #####
...@@ -358,7 +384,7 @@ steps: ...@@ -358,7 +384,7 @@ steps:
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
- label: Distributed Tests (2 GPUs) # 28min - label: Distributed Tests (2 GPUs) # 40min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
...@@ -370,19 +396,21 @@ steps: ...@@ -370,19 +396,21 @@ steps:
- tests/distributed/ - tests/distributed/
- vllm/compilation - vllm/compilation
commands: commands:
- pytest -v -s ./compile/test_full_graph_multi_gpu.py - pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py - pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed' - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
# Avoid importing model tests that cause CUDA reinitialization error # Avoid importing model tests that cause CUDA reinitialization error
- pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model - pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py - pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
- label: Multi-step Tests (4 GPUs) # 21min - label: Multi-step Tests (4 GPUs) # 36min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
...@@ -400,7 +428,7 @@ steps: ...@@ -400,7 +428,7 @@ steps:
- pytest -v -s multi_step/test_correctness_async_llm.py - pytest -v -s multi_step/test_correctness_async_llm.py
- pytest -v -s multi_step/test_correctness_llm.py - pytest -v -s multi_step/test_correctness_llm.py
- label: Pipeline Parallelism Test # 23min - label: Pipeline Parallelism Test # 45min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
...@@ -426,7 +454,7 @@ steps: ...@@ -426,7 +454,7 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s -x lora/test_long_context.py - pytest -v -s -x lora/test_long_context.py
- label: Weight Loading Multiple GPU Test - label: Weight Loading Multiple GPU Test # 33min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
...@@ -459,7 +487,7 @@ steps: ...@@ -459,7 +487,7 @@ steps:
# NOTE: don't test llama model here, it seems hf implementation is buggy # NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details # see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py - pytest -v -s distributed/test_custom_all_reduce.py
- TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
- pytest -v -s -x lora/test_mixtral.py - pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional - label: LM Eval Large Models # optional
...@@ -470,6 +498,5 @@ steps: ...@@ -470,6 +498,5 @@ steps:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-large.txt -t 4 - bash ./run-tests.sh -c configs/models-large.txt -t 4
vllm/*.so
/.venv /.venv
/build /build
dist dist
vllm/*.so
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
.mypy_cache
# Distribution / packaging
.Python
/build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
/dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# See https://help.github.com/articles/about-codeowners/
# for more info about CODEOWNERS file
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
CMakeLists.txt @tlrmchlsmth @WoosukKwon
# Test ownership
/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96
/tests/prefix_caching @comaniac @KuntaiDu
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/quantization @mgoin @robertgshaw2-neuralmagic
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/multi_step @alexm-neuralmagic @comaniac
/tests/weight_loading @mgoin @youkaichao
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
version: 2
updates:
# Maintain dependencies for GitHub Actions
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
name: Lint GitHub Actions workflows
on:
push:
branches:
- "main"
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
pull_request:
branches:
- "main"
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
env:
LC_ALL: en_US.UTF-8
defaults:
run:
shell: bash
permissions:
contents: read
jobs:
actionlint:
runs-on: ubuntu-latest
steps:
- name: "Checkout"
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
with:
fetch-depth: 0
- name: "Run actionlint"
run: |
tools/actionlint.sh -color
...@@ -8,7 +8,7 @@ jobs: ...@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Add label - name: Add label
uses: actions/github-script@v5 uses: actions/github-script@v7
with: with:
script: | script: |
github.rest.issues.addLabels({ github.rest.issues.addLabels({
......
...@@ -17,9 +17,9 @@ jobs: ...@@ -17,9 +17,9 @@ jobs:
matrix: matrix:
python-version: ["3.11"] python-version: ["3.11"]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2 uses: actions/setup-python@v5
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
- name: Install dependencies - name: Install dependencies
......
{
"problemMatcher": [
{
"owner": "actionlint",
"pattern": [
{
"regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
"file": 1,
"line": 2,
"column": 3,
"message": 4,
"code": 5
}
]
}
]
}
...@@ -11,15 +11,15 @@ on: ...@@ -11,15 +11,15 @@ on:
- main - main
jobs: jobs:
ruff: mypy:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2 uses: actions/setup-python@v5
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
- name: Install dependencies - name: Install dependencies
...@@ -32,15 +32,4 @@ jobs: ...@@ -32,15 +32,4 @@ jobs:
pip install types-setuptools pip install types-setuptools
- name: Mypy - name: Mypy
run: | run: |
mypy tools/mypy.sh
mypy tests --follow-imports skip
mypy vllm/attention --follow-imports skip
mypy vllm/distributed --follow-imports skip
mypy vllm/engine --follow-imports skip
mypy vllm/executor --follow-imports skip
mypy vllm/lora --follow-imports skip
mypy vllm/model_executor --follow-imports skip
mypy vllm/prompt_adapter --follow-imports skip
mypy vllm/spec_decode --follow-imports skip
mypy vllm/worker --follow-imports skip
...@@ -21,16 +21,16 @@ jobs: ...@@ -21,16 +21,16 @@ jobs:
upload_url: ${{ steps.create_release.outputs.upload_url }} upload_url: ${{ steps.create_release.outputs.upload_url }}
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v3 uses: actions/checkout@v4
- name: Extract branch info - name: Extract branch info
shell: bash shell: bash
run: | run: |
echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
- name: Create Release - name: Create Release
id: create_release id: create_release
uses: "actions/github-script@v6" uses: "actions/github-script@v7"
env: env:
RELEASE_TAG: ${{ env.release_tag }} RELEASE_TAG: ${{ env.release_tag }}
with: with:
...@@ -54,7 +54,7 @@ jobs: ...@@ -54,7 +54,7 @@ jobs:
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v3 uses: actions/checkout@v4
- name: Setup ccache - name: Setup ccache
uses: hendrikmuhs/ccache-action@v1.2 uses: hendrikmuhs/ccache-action@v1.2
...@@ -68,7 +68,7 @@ jobs: ...@@ -68,7 +68,7 @@ jobs:
bash -x .github/workflows/scripts/env.sh bash -x .github/workflows/scripts/env.sh
- name: Set up Python - name: Set up Python
uses: actions/setup-python@v4 uses: actions/setup-python@v5
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
...@@ -86,10 +86,10 @@ jobs: ...@@ -86,10 +86,10 @@ jobs:
CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
run: | run: |
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }} bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
wheel_name=$(ls dist/*whl | xargs -n 1 basename) wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
asset_name=${wheel_name//"linux"/"manylinux1"} asset_name=${wheel_name//"linux"/"manylinux1"}
echo "wheel_name=${wheel_name}" >> $GITHUB_ENV echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
echo "asset_name=${asset_name}" >> $GITHUB_ENV echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
- name: Upload Release Asset - name: Upload Release Asset
uses: actions/upload-release-asset@v1 uses: actions/upload-release-asset@v1
......
...@@ -8,7 +8,7 @@ jobs: ...@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Remind to run full CI on PR - name: Remind to run full CI on PR
uses: actions/github-script@v6 uses: actions/github-script@v7
with: with:
script: | script: |
github.rest.issues.createComment({ github.rest.issues.createComment({
......
...@@ -17,9 +17,9 @@ jobs: ...@@ -17,9 +17,9 @@ jobs:
matrix: matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2 uses: actions/setup-python@v5
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
- name: Install dependencies - name: Install dependencies
......
#!/bin/bash #!/bin/bash
set -eux
python_executable=python$1 python_executable=python$1
cuda_home=/usr/local/cuda-$2 cuda_home=/usr/local/cuda-$2
...@@ -8,13 +9,15 @@ PATH=${cuda_home}/bin:$PATH ...@@ -8,13 +9,15 @@ PATH=${cuda_home}/bin:$PATH
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
# Install requirements # Install requirements
$python_executable -m pip install wheel packaging $python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
$python_executable -m pip install -r requirements-cuda.txt
# Limit the number of parallel jobs to avoid OOM # Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1 export MAX_JOBS=1
# Make sure release wheels are built for the following architectures # Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real" export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
bash tools/check_repo.sh
# Build # Build
$python_executable setup.py bdist_wheel --dist-dir=dist $python_executable setup.py bdist_wheel --dist-dir=dist
...@@ -16,9 +16,9 @@ jobs: ...@@ -16,9 +16,9 @@ jobs:
matrix: matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2 uses: actions/setup-python@v5
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
- name: Install dependencies - name: Install dependencies
......
...@@ -33,6 +33,7 @@ share/python-wheels/ ...@@ -33,6 +33,7 @@ share/python-wheels/
.installed.cfg .installed.cfg
*.egg *.egg
MANIFEST MANIFEST
/.deps/
# PyInstaller # PyInstaller
# Usually these files are written by a python script from a template # Usually these files are written by a python script from a template
...@@ -198,3 +199,6 @@ hip_compat.h ...@@ -198,3 +199,6 @@ hip_compat.h
# Benchmark dataset # Benchmark dataset
benchmarks/*.json benchmarks/*.json
# Linting
actionlint
...@@ -13,10 +13,10 @@ sphinx: ...@@ -13,10 +13,10 @@ sphinx:
fail_on_warning: true fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF # If using Sphinx, optionally build your docs in additional formats such as PDF
formats: formats: []
- pdf
# Optionally declare the Python requirements required to build your docs # Optionally declare the Python requirements required to build your docs
python: python:
install: install:
- requirements: docs/requirements-docs.txt - requirements: docs/requirements-docs.txt
...@@ -147,14 +147,32 @@ else() ...@@ -147,14 +147,32 @@ else()
message(FATAL_ERROR "Can't find CUDA or HIP installation.") message(FATAL_ERROR "Can't find CUDA or HIP installation.")
endif() endif()
#
# Override the GPU architectures detected by cmake/torch and filter them by if(VLLM_GPU_LANG STREQUAL "CUDA")
# the supported versions for the current language. #
# The final set of arches is stored in `VLLM_GPU_ARCHES`. # For cuda we want to be able to control which architectures we compile for on
# # a per-file basis in order to cut down on compile time. So here we extract
override_gpu_arches(VLLM_GPU_ARCHES # the set of architectures we want to compile for and remove the from the
# CMAKE_CUDA_FLAGS so that they are not applied globally.
#
clear_cuda_arches(CUDA_ARCH_FLAGS)
extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
# Filter the target architectures by the supported supported archs
# since for some files we will build for all CUDA_ARCHS.
cuda_archs_loose_intersection(CUDA_ARCHS
"${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
else()
#
# For other GPU targets override the GPU architectures detected by cmake/torch
# and filter them by the supported versions for the current language.
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
#
override_gpu_arches(VLLM_GPU_ARCHES
${VLLM_GPU_LANG} ${VLLM_GPU_LANG}
"${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}") "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
endif()
# #
# Query torch for additional GPU compilation flags for the given # Query torch for additional GPU compilation flags for the given
...@@ -170,7 +188,16 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -170,7 +188,16 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif() endif()
#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
#
include(FetchContent) include(FetchContent)
get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
# #
# Define other extension targets # Define other extension targets
...@@ -224,30 +251,89 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -224,30 +251,89 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/mamba/causal_conv1d/causal_conv1d.cu" "csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
if (MARLIN_ARCHS)
set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu" "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
"csrc/quantization/gguf/gguf_kernel.cu" set_gencode_flags_for_srcs(
"csrc/quantization/fp8/fp8_marlin.cu" SRCS "${MARLIN_SRCS}"
"csrc/custom_all_reduce.cu" CUDA_ARCHS "${MARLIN_ARCHS}")
"csrc/permute_cols.cu" list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu" else()
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") message(STATUS "Not building Marlin kernels as no compatible archs found"
"in CUDA target architectures")
endif()
# #
# The CUTLASS kernels for Hopper require sm90a to be enabled. # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a. # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
# That adds an extra 17MB to compiled binary, so instead we selectively enable it. cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
set_source_files_properties( set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" set_gencode_flags_for_srcs(
PROPERTIES SRCS "${SRCS}"
COMPILE_FLAGS CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
"-gencode arch=compute_90a,code=sm_90a") list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running FP8 quantized models on "
"Hopper.")
else()
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
"in CUDA target architectures")
endif()
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# build any 3x kernels
set(SCALED_MM_3X_ARCHS)
endif()
#
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
else()
if (SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c2x as all archs are already built"
" for and covered by scaled_mm_c3x")
else()
message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
"in CUDA target architectures")
endif()
endif() endif()
...@@ -255,15 +341,26 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -255,15 +341,26 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Machete kernels # Machete kernels
# The machete kernels only work on hopper and require CUDA 12.0 or later. # The machete kernels only work on hopper and require CUDA 12.0 or later.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0) # Only build Machete kernels if we are building for something compatible with sm90a
cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
# #
# For the Machete kernels we automatically generate sources for various # For the Machete kernels we automatically generate sources for various
# preselected input type pairs and schedules. # preselected input type pairs and schedules.
# Generate sources: # Generate sources:
set(MACHETE_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")
if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
execute_process( execute_process(
COMMAND ${CMAKE_COMMAND} -E env COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
RESULT_VARIABLE machete_generation_result RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output OUTPUT_VARIABLE machete_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
...@@ -276,26 +373,40 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -276,26 +373,40 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"\nCheck the log for details: " "\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
else() else()
set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
CACHE STRING "Last run machete generate script hash" FORCE)
message(STATUS "Machete generation completed successfully.") message(STATUS "Machete generation completed successfully.")
endif() endif()
else()
message(STATUS "Machete generation script has not changed, skipping generation.")
endif()
# Add machete generated sources # Add machete generated sources
file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu") file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES}) list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}")
set_source_files_properties( # forward compatible
${MACHETE_GEN_SOURCES} set_gencode_flags_for_srcs(
PROPERTIES SRCS "${MACHETE_GEN_SOURCES}"
COMPILE_FLAGS CUDA_ARCHS "${MACHETE_ARCHS}")
"-gencode arch=compute_90a,code=sm_90a")
endif()
# Add pytorch binding for machete (add on even CUDA < 12.0 so that we can
# raise an error if the user that this was built with an incompatible
# CUDA version)
list(APPEND VLLM_EXT_SRC list(APPEND VLLM_EXT_SRC
csrc/quantization/machete/machete_pytorch.cu) csrc/quantization/machete/machete_pytorch.cu)
message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
AND MACHETE_ARCHS)
message(STATUS "Not building Machete kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running w4a16 quantized models on "
"Hopper.")
else()
message(STATUS "Not building Machete kernels as no compatible archs "
"found in CUDA target architectures")
endif()
endif()
# if CUDA endif
endif() endif()
message(STATUS "Enabling C extension.") message(STATUS "Enabling C extension.")
...@@ -324,14 +435,33 @@ set(VLLM_MOE_EXT_SRC ...@@ -324,14 +435,33 @@ set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp" "csrc/moe/torch_bindings.cpp"
"csrc/moe/topk_softmax_kernels.cu") "csrc/moe/topk_softmax_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_MOE_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h" "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h" "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu" "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h" "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu" "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
"csrc/moe/marlin_moe_ops.cu") "csrc/moe/marlin_moe_ops.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else()
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
"in CUDA target architectures")
endif()
endif() endif()
message(STATUS "Enabling moe extension.") message(STATUS "Enabling moe extension.")
...@@ -371,6 +501,17 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda") ...@@ -371,6 +501,17 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
return() return()
endif () endif ()
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
# arches in the CUDA case (and instead set the gencodes on a per file basis)
# we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
foreach(_ARCH ${CUDA_ARCHS})
string(REPLACE "." "" _ARCH "${_ARCH}")
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
endforeach()
endif()
# #
# Build vLLM flash attention from source # Build vLLM flash attention from source
# #
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment