From 3dd6853bc8c4fb8bbaf507c1699e5cbe8fa356ad Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 12 Jun 2024 12:58:02 -0400 Subject: [PATCH 001/376] [CI/Build] Add `is_quant_method_supported` to control quantization test configurations (#5253) --- tests/models/test_aqlm.py | 13 ++----------- tests/models/test_fp8.py | 12 ++---------- tests/models/test_gptq_marlin.py | 13 ++----------- tests/models/test_gptq_marlin_24.py | 13 ++----------- tests/models/test_marlin.py | 13 ++----------- tests/quantization/test_bitsandbytes.py | 10 +++------- tests/quantization/test_fp8.py | 10 +++------- tests/quantization/utils.py | 14 ++++++++++++++ 8 files changed, 30 insertions(+), 68 deletions(-) create mode 100644 tests/quantization/utils.py diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index c4ecf846e..80034a511 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -4,17 +4,8 @@ Run `pytest tests/models/test_aqlm.py`. """ import pytest -import torch -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS - -aqlm_not_supported = True - -if torch.cuda.is_available(): - capability = torch.cuda.get_device_capability() - capability = capability[0] * 10 + capability[1] - aqlm_not_supported = (capability < - QUANTIZATION_METHODS["aqlm"].get_min_capability()) +from tests.quantization.utils import is_quant_method_supported # In this test we hardcode prompts and generations for the model so we don't # need to require the AQLM package as a dependency @@ -67,7 +58,7 @@ ground_truth_generations = [ ] -@pytest.mark.skipif(aqlm_not_supported, +@pytest.mark.skipif(not is_quant_method_supported("aqlm"), reason="AQLM is not supported on this GPU type.") @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 61aee0d0a..b24c17cf3 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py 
@@ -8,8 +8,8 @@ import pytest import torch from transformers import AutoTokenizer +from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS os.environ["TOKENIZERS_PARALLELISM"] = "true" @@ -67,16 +67,8 @@ EXPECTED_STRS_MAP = { }, } -fp8_not_supported = True -if torch.cuda.is_available(): - capability = torch.cuda.get_device_capability() - capability = capability[0] * 10 + capability[1] - fp8_not_supported = (capability < - QUANTIZATION_METHODS["fp8"].get_min_capability()) - - -@pytest.mark.skipif(fp8_not_supported, +@pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index e957450cc..e30100d9b 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -11,9 +11,8 @@ Run `pytest tests/models/test_gptq_marlin.py`. 
import os import pytest -import torch -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from tests.quantization.utils import is_quant_method_supported from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT from .utils import check_logprobs_close @@ -22,14 +21,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true" MAX_MODEL_LEN = 1024 -gptq_marlin_not_supported = True - -if torch.cuda.is_available(): - capability = torch.cuda.get_device_capability() - capability = capability[0] * 10 + capability[1] - gptq_marlin_not_supported = ( - capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability()) - MODELS = [ # act_order==False, group_size=channelwise ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"), @@ -53,7 +44,7 @@ MODELS = [ @pytest.mark.flaky(reruns=3) -@pytest.mark.skipif(gptq_marlin_not_supported, +@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), reason="gptq_marlin is not supported on this GPU type.") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half", "bfloat16"]) diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index 195c3e5b5..60d9ae2f1 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/test_gptq_marlin_24.py @@ -9,18 +9,9 @@ Run `pytest tests/models/test_marlin_24.py`. 
from dataclasses import dataclass import pytest -import torch from tests.models.utils import check_logprobs_close -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS - -marlin_not_supported = True - -if torch.cuda.is_available(): - capability = torch.cuda.get_device_capability() - capability = capability[0] * 10 + capability[1] - marlin_not_supported = ( - capability < QUANTIZATION_METHODS["marlin"].get_min_capability()) +from tests.quantization.utils import is_quant_method_supported @dataclass @@ -47,7 +38,7 @@ model_pairs = [ @pytest.mark.flaky(reruns=2) -@pytest.mark.skipif(marlin_not_supported, +@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"), reason="Marlin24 is not supported on this GPU type.") @pytest.mark.parametrize("model_pair", model_pairs) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 761ba6aa4..e86f6e29d 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -13,20 +13,11 @@ Run `pytest tests/models/test_marlin.py`. 
from dataclasses import dataclass import pytest -import torch -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from tests.quantization.utils import is_quant_method_supported from .utils import check_logprobs_close -marlin_not_supported = True - -if torch.cuda.is_available(): - capability = torch.cuda.get_device_capability() - capability = capability[0] * 10 + capability[1] - marlin_not_supported = ( - capability < QUANTIZATION_METHODS["marlin"].get_min_capability()) - @dataclass class ModelPair: @@ -45,7 +36,7 @@ model_pairs = [ @pytest.mark.flaky(reruns=2) -@pytest.mark.skipif(marlin_not_supported, +@pytest.mark.skipif(not is_quant_method_supported("marlin"), reason="Marlin is not supported on this GPU type.") @pytest.mark.parametrize("model_pair", model_pairs) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index 31e938d15..953fd9ba9 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -5,16 +5,12 @@ Run `pytest tests/quantization/test_bitsandbytes.py`. 
import pytest import torch +from tests.quantization.utils import is_quant_method_supported from vllm import SamplingParams -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] - -@pytest.mark.skipif( - capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(), - reason='bitsandbytes is not supported on this GPU type.') +@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), + reason='bitsandbytes is not supported on this GPU type.') def test_load_bnb_model(vllm_runner) -> None: with vllm_runner('huggyllama/llama-7b', quantization='bitsandbytes', diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index fccce7f7b..3db12f379 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -5,16 +5,12 @@ Run `pytest tests/quantization/test_fp8.py --forked`. import pytest import torch -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from tests.quantization.utils import is_quant_method_supported from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] - -@pytest.mark.skipif( - capability < QUANTIZATION_METHODS["fp8"].get_min_capability(), - reason="FP8 is not supported on this GPU type.") +@pytest.mark.skipif(not is_quant_method_supported("fp8"), + reason="FP8 is not supported on this GPU type.") def test_load_fp16_model(vllm_runner) -> None: with vllm_runner("facebook/opt-125m", quantization="fp8") as llm: diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py new file mode 100644 index 000000000..0c92d565d --- /dev/null +++ b/tests/quantization/utils.py @@ -0,0 +1,14 @@ +import torch + +from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS + + +def is_quant_method_supported(quant_method: str) -> bool: + # 
Currently, all quantization methods require Nvidia or AMD GPUs + if not torch.cuda.is_available(): + return False + + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + return (capability < + QUANTIZATION_METHODS[quant_method].get_min_capability()) -- GitLab From e3c12bf6d22999cfbe267a7c788f6875340616cd Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Wed, 12 Jun 2024 12:03:24 -0500 Subject: [PATCH 002/376] Revert "[CI/Build] Add `is_quant_method_supported` to control quantization test configurations" (#5463) --- tests/models/test_aqlm.py | 13 +++++++++++-- tests/models/test_fp8.py | 12 ++++++++++-- tests/models/test_gptq_marlin.py | 13 +++++++++++-- tests/models/test_gptq_marlin_24.py | 13 +++++++++++-- tests/models/test_marlin.py | 13 +++++++++++-- tests/quantization/test_bitsandbytes.py | 10 +++++++--- tests/quantization/test_fp8.py | 10 +++++++--- tests/quantization/utils.py | 14 -------------- 8 files changed, 68 insertions(+), 30 deletions(-) delete mode 100644 tests/quantization/utils.py diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 80034a511..c4ecf846e 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -4,8 +4,17 @@ Run `pytest tests/models/test_aqlm.py`. 
""" import pytest +import torch -from tests.quantization.utils import is_quant_method_supported +from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS + +aqlm_not_supported = True + +if torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + aqlm_not_supported = (capability < + QUANTIZATION_METHODS["aqlm"].get_min_capability()) # In this test we hardcode prompts and generations for the model so we don't # need to require the AQLM package as a dependency @@ -58,7 +67,7 @@ ground_truth_generations = [ ] -@pytest.mark.skipif(not is_quant_method_supported("aqlm"), +@pytest.mark.skipif(aqlm_not_supported, reason="AQLM is not supported on this GPU type.") @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index b24c17cf3..61aee0d0a 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -8,8 +8,8 @@ import pytest import torch from transformers import AutoTokenizer -from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams +from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS os.environ["TOKENIZERS_PARALLELISM"] = "true" @@ -67,8 +67,16 @@ EXPECTED_STRS_MAP = { }, } +fp8_not_supported = True -@pytest.mark.skipif(not is_quant_method_supported("fp8"), +if torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + fp8_not_supported = (capability < + QUANTIZATION_METHODS["fp8"].get_min_capability()) + + +@pytest.mark.skipif(fp8_not_supported, reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index e30100d9b..e957450cc 
100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -11,8 +11,9 @@ Run `pytest tests/models/test_gptq_marlin.py`. import os import pytest +import torch -from tests.quantization.utils import is_quant_method_supported +from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT from .utils import check_logprobs_close @@ -21,6 +22,14 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true" MAX_MODEL_LEN = 1024 +gptq_marlin_not_supported = True + +if torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + gptq_marlin_not_supported = ( + capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability()) + MODELS = [ # act_order==False, group_size=channelwise ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"), @@ -44,7 +53,7 @@ MODELS = [ @pytest.mark.flaky(reruns=3) -@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), +@pytest.mark.skipif(gptq_marlin_not_supported, reason="gptq_marlin is not supported on this GPU type.") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half", "bfloat16"]) diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index 60d9ae2f1..195c3e5b5 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/test_gptq_marlin_24.py @@ -9,9 +9,18 @@ Run `pytest tests/models/test_marlin_24.py`. 
from dataclasses import dataclass import pytest +import torch from tests.models.utils import check_logprobs_close -from tests.quantization.utils import is_quant_method_supported +from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS + +marlin_not_supported = True + +if torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + marlin_not_supported = ( + capability < QUANTIZATION_METHODS["marlin"].get_min_capability()) @dataclass @@ -38,7 +47,7 @@ model_pairs = [ @pytest.mark.flaky(reruns=2) -@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"), +@pytest.mark.skipif(marlin_not_supported, reason="Marlin24 is not supported on this GPU type.") @pytest.mark.parametrize("model_pair", model_pairs) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index e86f6e29d..761ba6aa4 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -13,11 +13,20 @@ Run `pytest tests/models/test_marlin.py`. 
from dataclasses import dataclass import pytest +import torch -from tests.quantization.utils import is_quant_method_supported +from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from .utils import check_logprobs_close +marlin_not_supported = True + +if torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + marlin_not_supported = ( + capability < QUANTIZATION_METHODS["marlin"].get_min_capability()) + @dataclass class ModelPair: @@ -36,7 +45,7 @@ model_pairs = [ @pytest.mark.flaky(reruns=2) -@pytest.mark.skipif(not is_quant_method_supported("marlin"), +@pytest.mark.skipif(marlin_not_supported, reason="Marlin is not supported on this GPU type.") @pytest.mark.parametrize("model_pair", model_pairs) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index 953fd9ba9..31e938d15 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -5,12 +5,16 @@ Run `pytest tests/quantization/test_bitsandbytes.py`. 
import pytest import torch -from tests.quantization.utils import is_quant_method_supported from vllm import SamplingParams +from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +capability = torch.cuda.get_device_capability() +capability = capability[0] * 10 + capability[1] -@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), - reason='bitsandbytes is not supported on this GPU type.') + +@pytest.mark.skipif( + capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(), + reason='bitsandbytes is not supported on this GPU type.') def test_load_bnb_model(vllm_runner) -> None: with vllm_runner('huggyllama/llama-7b', quantization='bitsandbytes', diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 3db12f379..fccce7f7b 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -5,12 +5,16 @@ Run `pytest tests/quantization/test_fp8.py --forked`. import pytest import torch -from tests.quantization.utils import is_quant_method_supported +from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod +capability = torch.cuda.get_device_capability() +capability = capability[0] * 10 + capability[1] -@pytest.mark.skipif(not is_quant_method_supported("fp8"), - reason="FP8 is not supported on this GPU type.") + +@pytest.mark.skipif( + capability < QUANTIZATION_METHODS["fp8"].get_min_capability(), + reason="FP8 is not supported on this GPU type.") def test_load_fp16_model(vllm_runner) -> None: with vllm_runner("facebook/opt-125m", quantization="fp8") as llm: diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py deleted file mode 100644 index 0c92d565d..000000000 --- a/tests/quantization/utils.py +++ /dev/null @@ -1,14 +0,0 @@ -import torch - -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS - - -def is_quant_method_supported(quant_method: str) -> bool: - # 
Currently, all quantization methods require Nvidia or AMD GPUs - if not torch.cuda.is_available(): - return False - - capability = torch.cuda.get_device_capability() - capability = capability[0] * 10 + capability[1] - return (capability < - QUANTIZATION_METHODS[quant_method].get_min_capability()) -- GitLab From 847cdcca1c94b12e6c118dbf863e4b111d1b4fd2 Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Thu, 13 Jun 2024 02:06:14 +0900 Subject: [PATCH 003/376] [CI] Upgrade codespell version. (#5381) --- .github/workflows/ruff.yml | 2 +- requirements-dev.txt | 2 +- tests/core/test_chunked_prefill_scheduler.py | 2 +- tests/test_sharded_state_loader.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index e71033f82..773def58f 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -25,7 +25,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2 + pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2 - name: Analysing the code with ruff run: | ruff . diff --git a/requirements-dev.txt b/requirements-dev.txt index 12b22a61e..b380ef205 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,7 +3,7 @@ yapf==0.32.0 toml==0.10.2 tomli==2.0.1 ruff==0.1.5 -codespell==2.2.6 +codespell==2.3.0 isort==5.13.2 clang-format==18.1.5 diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index 3649e6b00..f68482cc0 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -149,7 +149,7 @@ def test_complex(): # Only the first seq group has a new token appended. append_new_token(running[0], 1) - # Add 2 more requsets. + # Add 2 more requests. 
for i in range(2, 4): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) scheduler.add_seq_group(seq_group) diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py index de79c3b94..f5d956904 100644 --- a/tests/test_sharded_state_loader.py +++ b/tests/test_sharded_state_loader.py @@ -39,7 +39,7 @@ def test_filter_subtensors(): filtered_state_dict = ShardedStateLoader._filter_subtensors(state_dict) assert tuple(filtered_state_dict.keys()) == ("a", "b", "c") for key, tensor in filtered_state_dict.items(): - # NOTE: don't use `euqal` here, as the tensor might contain NaNs + # NOTE: don't use `equal` here, as the tensor might contain NaNs assert tensor is state_dict[key] -- GitLab From 1a8bfd92d5f35d638e3cfc8c4cd1779aeda0adfb Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 12 Jun 2024 11:53:03 -0700 Subject: [PATCH 004/376] [Hardware] Initial TPU integration (#5292) --- Dockerfile.tpu | 19 + benchmarks/benchmark_latency.py | 2 +- benchmarks/benchmark_throughput.py | 2 +- .../getting_started/tpu-installation.rst | 75 +++ docs/source/index.rst | 3 +- requirements-tpu.txt | 7 + setup.py | 22 +- vllm/attention/backends/pallas.py | 232 ++++++++ vllm/attention/selector.py | 13 +- vllm/config.py | 6 +- vllm/engine/arg_utils.py | 2 +- vllm/engine/async_llm_engine.py | 3 + vllm/engine/llm_engine.py | 3 + vllm/envs.py | 6 + vllm/executor/tpu_executor.py | 101 ++++ vllm/model_executor/custom_op.py | 4 +- .../model_executor/layers/rotary_embedding.py | 77 ++- vllm/model_executor/model_loader/loader.py | 27 +- vllm/utils.py | 14 + vllm/worker/cache_engine.py | 9 +- vllm/worker/tpu_model_runner.py | 525 ++++++++++++++++++ vllm/worker/tpu_worker.py | 198 +++++++ 22 files changed, 1322 insertions(+), 28 deletions(-) create mode 100644 Dockerfile.tpu create mode 100644 docs/source/getting_started/tpu-installation.rst create mode 100644 requirements-tpu.txt create mode 100644 vllm/attention/backends/pallas.py create mode 100644 
vllm/executor/tpu_executor.py create mode 100644 vllm/worker/tpu_model_runner.py create mode 100644 vllm/worker/tpu_worker.py diff --git a/Dockerfile.tpu b/Dockerfile.tpu new file mode 100644 index 000000000..931c844c0 --- /dev/null +++ b/Dockerfile.tpu @@ -0,0 +1,19 @@ +ARG NIGHTLY_DATE="20240601" +ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" + +FROM $BASE_IMAGE + +WORKDIR /workspace +COPY . /workspace/vllm + +ENV VLLM_TARGET_DEVICE="tpu" +# Install aiohttp separately to avoid build errors. +RUN pip install aiohttp +# Install the TPU and Pallas dependencies. +RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html +RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html + +# Build vLLM. +RUN cd /workspace/vllm && python setup.py develop + +CMD ["/bin/bash"] diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 1a41b66b3..17edb7515 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -189,7 +189,7 @@ if __name__ == '__main__': "--device", type=str, default="cuda", - choices=["cuda", "cpu"], + choices=["cuda", "cpu", "tpu"], help='device type for vLLM execution, supporting CUDA and CPU.') parser.add_argument('--block-size', type=int, diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 90f7433e0..07b2f8541 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -346,7 +346,7 @@ if __name__ == "__main__": "--device", type=str, default="cuda", - choices=["cuda", "cpu"], + choices=["cuda", "cpu", "tpu"], help='device type for vLLM execution, supporting CUDA and CPU.') parser.add_argument( "--enable-prefix-caching", diff --git a/docs/source/getting_started/tpu-installation.rst 
b/docs/source/getting_started/tpu-installation.rst new file mode 100644 index 000000000..3627600e1 --- /dev/null +++ b/docs/source/getting_started/tpu-installation.rst @@ -0,0 +1,75 @@ +.. _installation_tpu: + +Installation with TPU +===================== + +vLLM supports Google Cloud TPUs using PyTorch XLA. + +Requirements +------------ + +* Google Cloud TPU VM (single host) +* TPU versions: v5e, v5p, v4 +* Python: 3.10 + +Installation options: + +1. :ref:`Build a docker image with Dockerfile <build_docker_tpu>`. +2. :ref:`Build from source <build_from_source_tpu>`. + +.. _build_docker_tpu: + +Build a docker image with :code:`Dockerfile.tpu` +------------------------------------------------ + +`Dockerfile.tpu <https://github.com/vllm-project/vllm/blob/main/Dockerfile.tpu>`_ is provided to build a docker image with TPU support. + +.. code-block:: console + + $ docker build -f Dockerfile.tpu -t vllm-tpu . + + +You can run the docker image with the following command: + +.. code-block:: console + + $ # Make sure to add `--privileged --net host --shm-size=16G`. + $ docker run --privileged --net host --shm-size=16G -it vllm-tpu + + +.. _build_from_source_tpu: + +Build from source +----------------- + +You can also build and install the TPU backend from source. + +First, install the dependencies: + +.. code-block:: console + + $ # (Recommended) Create a new conda environment. + $ conda create -n myenv python=3.10 -y + $ conda activate myenv + + $ # Clean up the existing torch and torch-xla packages. + $ pip uninstall torch torch-xla -y + + $ # Install PyTorch and PyTorch XLA. + $ export DATE="+20240601" + $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-nightly${DATE}-cp310-cp310-linux_x86_64.whl + $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly${DATE}-cp310-cp310-linux_x86_64.whl + + $ # Install JAX and Pallas. 
+ $ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html + $ pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html + + $ # Install other build dependencies. + $ pip install packaging aiohttp + + +Next, build vLLM from source. This will only take a few seconds: + +.. code-block:: console + + $ VLLM_TARGET_DEVICE="tpu" python setup.py develop diff --git a/docs/source/index.rst b/docs/source/index.rst index 807251d02..b7c0d5b88 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -63,8 +63,9 @@ Documentation getting_started/installation getting_started/amd-installation - getting_started/neuron-installation getting_started/cpu-installation + getting_started/neuron-installation + getting_started/tpu-installation getting_started/quickstart getting_started/debugging getting_started/examples/examples_index diff --git a/requirements-tpu.txt b/requirements-tpu.txt new file mode 100644 index 000000000..22487f552 --- /dev/null +++ b/requirements-tpu.txt @@ -0,0 +1,7 @@ +# Common dependencies +-r requirements-common.txt + +# Dependencies for TPU +# Currently, the TPU backend uses a nightly version of PyTorch XLA. +# You can install the dependencies in Dockerfile.tpu. 
+triton # To avoid import errors diff --git a/setup.py b/setup.py index 53a697232..12e5c3456 100644 --- a/setup.py +++ b/setup.py @@ -206,9 +206,9 @@ class cmake_build_ext(build_ext): def _is_cuda() -> bool: - return VLLM_TARGET_DEVICE == "cuda" \ - and torch.version.cuda is not None \ - and not _is_neuron() + has_cuda = torch.version.cuda is not None + return (VLLM_TARGET_DEVICE == "cuda" and has_cuda + and not (_is_neuron() or _is_tpu())) def _is_hip() -> bool: @@ -225,10 +225,18 @@ def _is_neuron() -> bool: return torch_neuronx_installed or VLLM_TARGET_DEVICE == "neuron" +def _is_tpu() -> bool: + return VLLM_TARGET_DEVICE == "tpu" + + def _is_cpu() -> bool: return VLLM_TARGET_DEVICE == "cpu" +def _build_custom_ops() -> bool: + return _is_cuda() or _is_hip() or _is_cpu() + + def _install_punica() -> bool: return envs.VLLM_INSTALL_PUNICA_KERNELS @@ -325,6 +333,8 @@ def get_vllm_version() -> str: if neuron_version != MAIN_CUDA_VERSION: neuron_version_str = neuron_version.replace(".", "")[:3] version += f"+neuron{neuron_version_str}" + elif _is_tpu(): + version += "+tpu" elif _is_cpu(): version += "+cpu" else: @@ -372,6 +382,8 @@ def get_requirements() -> List[str]: requirements = _read_requirements("requirements-rocm.txt") elif _is_neuron(): requirements = _read_requirements("requirements-neuron.txt") + elif _is_tpu(): + requirements = _read_requirements("requirements-tpu.txt") elif _is_cpu(): requirements = _read_requirements("requirements-cpu.txt") else: @@ -385,7 +397,7 @@ ext_modules = [] if _is_cuda() or _is_hip(): ext_modules.append(CMakeExtension(name="vllm._moe_C")) -if not _is_neuron(): +if _build_custom_ops(): ext_modules.append(CMakeExtension(name="vllm._C")) if _install_punica(): @@ -428,6 +440,6 @@ setup( extras_require={ "tensorizer": ["tensorizer>=2.9.0"], }, - cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {}, + cmdclass={"build_ext": cmake_build_ext} if _build_custom_ops() else {}, package_data=package_data, ) diff --git 
a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py new file mode 100644 index 000000000..b203c5ec5 --- /dev/null +++ b/vllm/attention/backends/pallas.py @@ -0,0 +1,232 @@ +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type + +import torch +import torch_xla.experimental.custom_kernel # Required to register custom ops. +import torch_xla.experimental.dynamo_set_buffer_donor + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata) + + +class PallasAttentionBackend(AttentionBackend): + + @staticmethod + def get_impl_cls() -> Type["PallasAttentionBackendImpl"]: + return PallasAttentionBackendImpl + + @staticmethod + def make_metadata(*args, **kwargs) -> "PallasMetadata": + return PallasMetadata(*args, **kwargs) + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return (num_kv_heads, num_blocks, block_size, head_size) + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: Dict[int, int], + ) -> None: + raise NotImplementedError("swap_blocks is not implemented.") + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], + ) -> None: + # TODO(woosuk): Implement this. + raise NotImplementedError("copy_blocks is not implemented.") + + +@dataclass +class PallasMetadata(AttentionMetadata): + + # Currently, input sequences can only contain all prefills + # or all decoding. 
+ block_tables: Optional[torch.Tensor] + context_lens: Optional[torch.Tensor] + + @property + def prefill_metadata(self) -> Optional["PallasMetadata"]: + if self.num_prefills == 0: + return None + + assert self.num_decode_tokens == 0 + assert self.block_tables is None + assert self.context_lens is None + return self + + @property + def decode_metadata(self) -> Optional["PallasMetadata"]: + if self.num_decode_tokens == 0: + return None + + assert self.num_prefills == 0 + assert self.num_prefill_tokens == 0 + assert self.block_tables is not None + assert self.context_lens is not None + return self + + +class PallasAttentionBackendImpl(AttentionImpl): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + if head_size % 128 != 0: + raise NotImplementedError("Head size must be a multiple of 128.") + if alibi_slopes is not None: + raise NotImplementedError("Alibi slopes is not supported.") + if sliding_window is not None: + raise NotImplementedError("Sliding window is not supported.") + if kv_cache_dtype != "auto": + raise NotImplementedError("FP8 KV cache dtype is not supported.") + if blocksparse_params is not None: + raise NotImplementedError("Blocksparse is not supported.") + + if torch_xla.tpu.version() < 4: + raise NotImplementedError("TPU version must be 4 or higher.") + + self.megacore_mode = None + tpu_type = torch_xla.tpu.get_tpu_env()["TYPE"].lower() + if not tpu_type.endswith("lite"): + if self.num_kv_heads % 2 == 0: + self.megacore_mode = "kv_head" + else: + # NOTE(woosuk): If the 
batch size is not a multiple of 2, the + # megacore mode will be None. + self.megacore_mode = "batch" + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: Tuple[Optional[torch.Tensor], Optional[torch.Tensor]], + attn_metadata: PallasMetadata, + kv_scale: float = 1.0, + ) -> torch.Tensor: + """Forward pass with Pallas attention. + + Args: + query: shape = [batch_size, seq_len, num_heads * head_size] + key: shape = [batch_size, seq_len, num_kv_heads * head_size] + value: shape = [batch_size, seq_len, num_kv_heads * head_size] + key_cache = [num_kv_heads, num_blocks, block_size, head_size] + value_cache = [num_kv_heads, num_blocks, block_size, head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [batch_size, seq_len, num_heads * head_size] + """ + assert kv_scale == 1.0 + batch_size, seq_len, hidden_size = query.shape + query = query.view(batch_size, seq_len, self.num_heads, self.head_size) + key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size) + value = value.view(batch_size, seq_len, self.num_kv_heads, + self.head_size) + + if kv_cache[0] is not None: + slot_mapping = attn_metadata.slot_mapping + key_cache, value_cache = kv_cache + write_to_kv_cache(key, value, key_cache, value_cache, slot_mapping) + + query = query * self.scale + if attn_metadata.num_prefills > 0: + assert seq_len % 16 == 0, ( + "Pallas FlashAttention kernel requires seq_len to be a " + f"multiple of 16 but got {seq_len}") + + # Handle GQA/MQA. + if self.num_kv_heads != self.num_heads: + key = key.repeat_interleave(self.num_queries_per_kv, dim=-2) + key = key.view(batch_size, seq_len, self.num_heads, + self.head_size) + value = value.repeat_interleave(self.num_queries_per_kv, + dim=-2) + value = value.view(batch_size, seq_len, self.num_heads, + self.head_size) + # FlashAttention requires [batch_size, num_heads, seq_len, d_model] + # while the input is [batch_size, seq_len, num_heads, d_model]. 
+ # Permute the input to match the required format. + output = torch.ops.xla.flash_attention( + query.permute(0, 2, 1, 3), + key.permute(0, 2, 1, 3), + value.permute(0, 2, 1, 3), + True, + ) + output = output.permute(0, 2, 1, 3) + else: + # Decoding run. + assert kv_cache is not None + + pages_per_compute_block = 16 # TODO(woosuk): Tune this value. + if self.megacore_mode == "batch" and batch_size % 2 != 0: + megacore_mode = None + else: + megacore_mode = self.megacore_mode + + # NOTE(woosuk): A temporary workaround to avoid the error: + # "xla::paged_attention() Expected a value of type 'str' for + # argument 'megacore_mode' but instead found type 'NoneType'." + if megacore_mode is not None: + output = torch.ops.xla.paged_attention( + query.squeeze(dim=1), + key_cache, + value_cache, + attn_metadata.context_lens, + attn_metadata.block_tables, + pages_per_compute_block, + megacore_mode=megacore_mode, + ) + else: + output = torch.ops.xla.paged_attention( + query.squeeze(dim=1), + key_cache, + value_cache, + attn_metadata.context_lens, + attn_metadata.block_tables, + pages_per_compute_block, + ) + + # Reshape the output tensor. 
+ return output.reshape(batch_size, seq_len, hidden_size) + + +def write_to_kv_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, +) -> None: + torch.ops.xla.dynamo_set_buffer_donor_(key_cache, True) + torch.ops.xla.dynamo_set_buffer_donor_(value_cache, True) + + key = key.flatten(0, 2) + value = value.flatten(0, 2) + key_cache = key_cache.flatten(0, 2) + value_cache = value_cache.flatten(0, 2) + key_cache.index_copy_(0, slot_mapping, key) + value_cache.index_copy_(0, slot_mapping, value) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 7253483f9..3f0e29c73 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -7,7 +7,7 @@ import torch import vllm.envs as envs from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger -from vllm.utils import is_cpu, is_hip +from vllm.utils import is_cpu, is_hip, is_tpu logger = init_logger(__name__) @@ -18,6 +18,7 @@ class _Backend(enum.Enum): ROCM_FLASH = enum.auto() TORCH_SDPA = enum.auto() FLASHINFER = enum.auto() + PALLAS = enum.auto() @lru_cache(maxsize=None) @@ -66,6 +67,10 @@ def get_attn_backend( "Please make sure --enforce-eager is set.") from vllm.attention.backends.flashinfer import FlashInferBackend return FlashInferBackend + elif backend == _Backend.PALLAS: + logger.info("Using Pallas backend.") + from vllm.attention.backends.pallas import PallasAttentionBackend + return PallasAttentionBackend else: raise ValueError("Invalid attention backend.") @@ -80,7 +85,6 @@ def which_attn_to_use( block_size: int, ) -> _Backend: """Returns which flash attention backend to use.""" - # Default case. 
selected_backend = _Backend.FLASH_ATTN @@ -100,6 +104,11 @@ def which_attn_to_use( logger.info("Cannot use %s backend on CPU.", selected_backend) return _Backend.TORCH_SDPA + if is_tpu(): + if selected_backend != _Backend.PALLAS: + logger.info("Cannot use %s backend on TPU.", selected_backend) + return _Backend.PALLAS + if is_hip(): # AMD GPUs. selected_backend = (_Backend.ROCM_FLASH if selected_backend diff --git a/vllm/config.py b/vllm/config.py index 50b0156b1..2513d43ce 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -11,7 +11,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.models import ModelRegistry from vllm.transformers_utils.config import get_config, get_hf_text_config -from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron +from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron, is_tpu if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -748,6 +748,8 @@ class DeviceConfig: # Automated device type detection if is_neuron(): self.device_type = "neuron" + elif is_tpu(): + self.device_type = "tpu" elif is_cpu(): self.device_type = "cpu" else: @@ -761,6 +763,8 @@ class DeviceConfig: # Some device types require processing inputs on CPU if self.device_type in ["neuron"]: self.device = torch.device("cpu") + elif self.device_type in ["tpu"]: + self.device = None else: # Set device with device type self.device = torch.device(self.device_type) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cd29db7d7..227de5475 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -504,7 +504,7 @@ class EngineArgs: parser.add_argument("--device", type=str, default=EngineArgs.device, - choices=["auto", "cuda", "neuron", "cpu"], + choices=["auto", "cuda", "neuron", "cpu", "tpu"], help='Device type for vLLM execution.') # Related to Vision-language models such as llava diff --git a/vllm/engine/async_llm_engine.py 
b/vllm/engine/async_llm_engine.py index aa1f07b5b..943402c86 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -375,6 +375,9 @@ class AsyncLLMEngine: if engine_config.device_config.device_type == "neuron": from vllm.executor.neuron_executor import NeuronExecutorAsync executor_class = NeuronExecutorAsync + elif engine_config.device_config.device_type == "tpu": + from vllm.executor.tpu_executor import TPUExecutorAsync + executor_class = TPUExecutorAsync elif engine_config.device_config.device_type == "cpu": assert distributed_executor_backend is None, ( "Distributed execution is not supported with the CPU backend.") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4f56bbd5c..ea7547584 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -341,6 +341,9 @@ class LLMEngine: if engine_config.device_config.device_type == "neuron": from vllm.executor.neuron_executor import NeuronExecutor executor_class = NeuronExecutor + elif engine_config.device_config.device_type == "tpu": + from vllm.executor.tpu_executor import TPUExecutor + executor_class = TPUExecutor elif engine_config.device_config.device_type == "cpu": from vllm.executor.cpu_executor import CPUExecutor executor_class = CPUExecutor diff --git a/vllm/envs.py b/vllm/envs.py index f0513b9af..f03b69f4b 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -27,6 +27,7 @@ if TYPE_CHECKING: VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 + VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/" VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_WORKER_MULTIPROC_METHOD: str = "spawn" VLLM_IMAGE_FETCH_TIMEOUT: int = 5 @@ -217,6 +218,11 @@ environment_variables: Dict[str, Callable[[], Any]] = { # Default is 5 seconds "VLLM_IMAGE_FETCH_TIMEOUT": lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")), + + # Path to the XLA persistent cache directory. + # Only used for XLA devices such as TPUs. 
+ "VLLM_XLA_CACHE_PATH": + lambda: os.getenv("VLLM_XLA_CACHE_PATH", "~/.vllm/xla_cache/"), } # end-env-vars-definition diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py new file mode 100644 index 000000000..7061ad85f --- /dev/null +++ b/vllm/executor/tpu_executor.py @@ -0,0 +1,101 @@ +from typing import List, Set, Tuple + +import torch + +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + make_async) + +logger = init_logger(__name__) + + +class TPUExecutor(ExecutorBase): + + def _init_executor(self) -> None: + assert not self.scheduler_config.chunked_prefill_enabled, ( + "Chunked prefill is not yet supported for TPU backend") + assert not self.speculative_config, ( + "Speculative decoding is not yet supported for TPU backend") + if self.model_config.dtype in (torch.float16, torch.float32): + logger.warning( + "The TPU backend currently does not support %s. " + "Using bfloat16 instead.", self.model_config.dtype) + self.model_config.dtype = torch.bfloat16 + + # Instantiate the worker and load the model to the device. 
+ self._init_worker() + + def _init_worker(self): + from vllm.worker.tpu_worker import TPUWorker + + assert self.parallel_config.world_size == 1, ( + "TPUExecutor currently only supports a single TPU chip.") + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + self.driver_worker = TPUWorker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + self.cache_config, + self.load_config, + self.vision_language_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + ) + self.driver_worker.init_device() + self.driver_worker.load_model() + + def initialize_cache( + self, + num_gpu_blocks: int, + num_cpu_blocks: int, + ) -> None: + """Initialize the KV cache by invoking the underlying worker.""" + # NOTE: This is logged in the executor because there can be >1 worker + # with other executors. We could log in the engine level, but work + # remains to abstract away the device for non-GPU configurations. + logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, + num_cpu_blocks) + self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of available KV blocks by invoking the + underlying worker. 
+ """ + return self.driver_worker.determine_num_available_blocks() + + def execute_model( + self, + execute_model_req: ExecuteModelRequest, + ) -> List[SamplerOutput]: + output = self.driver_worker.execute_model(execute_model_req) + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError("LoRA is not implemented for TPU backend.") + + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for TPU backend.") + + def list_loras(self) -> Set[int]: + raise NotImplementedError("LoRA is not implemented for TPU backend.") + + def check_health(self) -> None: + # TPUExecutor will always be healthy as long as it's running. + return + + +class TPUExecutorAsync(TPUExecutor, ExecutorAsyncBase): + + async def execute_model_async( + self, + sexecute_model_req: ExecuteModelRequest, + ) -> SamplerOutput: + output = await make_async(self.driver_worker.execute_model + )(sexecute_model_req) + return output diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 1d49213cd..56aa629ae 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,6 +1,6 @@ import torch.nn as nn -from vllm.utils import is_cpu, is_hip +from vllm.utils import is_cpu, is_hip, is_tpu class CustomOp(nn.Module): @@ -56,5 +56,7 @@ class CustomOp(nn.Module): return self.forward_hip elif is_cpu(): return self.forward_cpu + elif is_tpu(): + return self.forward_tpu else: return self.forward_cuda diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index d2652106b..792c47293 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -28,6 +28,7 @@ import torch import torch.nn as nn from vllm.model_executor.custom_op import CustomOp +from vllm.utils import is_tpu def _rotate_neox(x: torch.Tensor) -> torch.Tensor: @@ -43,6 +44,19 @@ def _rotate_gptj(x: 
torch.Tensor) -> torch.Tensor: return x.flatten(-2) +def _apply_rotary_emb( + x: torch.Tensor, + freqs_cis: torch.Tensor, +) -> torch.Tensor: + x_ = torch.view_as_complex( + torch.stack(torch.chunk(x.transpose(1, 2).float(), 2, dim=-1), dim=-1)) + x_out = torch.view_as_real(x_ * freqs_cis).type_as(x) + x_out = torch.cat(torch.chunk(x_out, 2, dim=-1), dim=-2) + x_out = x_out.reshape(x_out.shape[0], x_out.shape[1], x_out.shape[2], + -1).transpose(1, 2) + return x_out + + class RotaryEmbedding(CustomOp): """Original rotary positional embedding.""" @@ -64,8 +78,14 @@ class RotaryEmbedding(CustomOp): self.dtype = dtype cache = self._compute_cos_sin_cache() - cache = cache.to(dtype) - self.register_buffer("cos_sin_cache", cache, persistent=False) + self.use_native2 = is_tpu() and is_neox_style + if not self.use_native2: + cache = cache.to(dtype) + self.register_buffer("cos_sin_cache", cache, persistent=False) + else: + cos, sin = cache.chunk(2, dim=-1) + freqs_cis = cos + 1j * sin + self.register_buffer("freqs_cis", freqs_cis, persistent=False) def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: """Compute the inverse frequency.""" @@ -100,7 +120,11 @@ class RotaryEmbedding(CustomOp): key: torch.Tensor, offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: - """PyTorch-native implementation equivalent to forward().""" + """A PyTorch-native implementation equivalent to forward(). + + This method mimics the implementation of the custom CUDA kernel + used in `forward_cuda()`. + """ query = query.view(*query.shape[:-1], -1, self.head_size) key = key.view(*key.shape[:-1], -1, self.head_size) @@ -138,6 +162,42 @@ class RotaryEmbedding(CustomOp): key = key.flatten(-2) return query, key + def forward_native2( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Another PyTorch-native implementation of forward(). 
+ + This method might perform better than `forward_native()` when compiled. + """ + if positions.dim() == 1: + batch_size = 1 + seq_len = positions.shape[0] + else: + batch_size, seq_len = positions.shape + if offsets is not None: + positions = positions + offsets + freqs_cis = self.freqs_cis.index_select(0, positions.flatten()) + freqs_cis = freqs_cis.view(batch_size, 1, seq_len, -1) + + query_shape = query.shape + query = query.view(batch_size, seq_len, -1, self.head_size) + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = _apply_rotary_emb(query_rot, freqs_cis) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + key_shape = key.shape + key = key.view(batch_size, seq_len, -1, self.head_size) + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = _apply_rotary_emb(key_rot, freqs_cis) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key + def forward_cuda( self, positions: torch.Tensor, @@ -161,6 +221,17 @@ class RotaryEmbedding(CustomOp): self.cos_sin_cache, self.is_neox_style) return query, key + def forward_tpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + forward_fn = (self.forward_native2 + if self.use_native2 else self.forward_native) + return forward_fn(positions, query, key, offsets) + def extra_repr(self) -> str: s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}" s += f", max_position_embeddings={self.max_position_embeddings}" diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 9c2eaee2e..f4c3dcbac 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -34,6 +34,7 @@ from vllm.model_executor.model_loader.weight_utils import ( pt_weights_iterator, safetensors_weights_iterator) 
from vllm.model_executor.models.vlm_base import VisionLanguageModelBase from vllm.model_executor.utils import set_weight_attrs +from vllm.utils import is_tpu logger = init_logger(__name__) @@ -227,12 +228,26 @@ class DefaultModelLoader(BaseModelLoader): if self.load_config.load_format == LoadFormat.NPCACHE: # Currently np_cache only support *.bin checkpoints assert use_safetensors is False - return np_cache_weights_iterator(model_name_or_path, - self.load_config.download_dir, - hf_folder, hf_weights_files) - if use_safetensors: - return safetensors_weights_iterator(hf_weights_files) - return pt_weights_iterator(hf_weights_files) + weights_iterator = np_cache_weights_iterator( + model_name_or_path, self.load_config.download_dir, hf_folder, + hf_weights_files) + elif use_safetensors: + weights_iterator = safetensors_weights_iterator(hf_weights_files) + else: + weights_iterator = pt_weights_iterator(hf_weights_files) + + if is_tpu(): + # In PyTorch XLA, we should call `xm.mark_step` frequently so that + # not too many ops are accumulated in the XLA program. 
+ import torch_xla.core.xla_model as xm + + def _xla_weights_iterator(iterator: Generator): + for weights in iterator: + yield weights + xm.mark_step() + + weights_iterator = _xla_weights_iterator(weights_iterator) + return weights_iterator def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, diff --git a/vllm/utils.py b/vllm/utils.py index 54d446b23..af585929d 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -146,6 +146,15 @@ def is_neuron() -> bool: return transformers_neuronx is not None +@lru_cache(maxsize=None) +def is_tpu() -> bool: + try: + import libtpu + except ImportError: + libtpu = None + return libtpu is not None + + @lru_cache(maxsize=None) def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" @@ -546,6 +555,11 @@ def maybe_expand_dim(tensor: torch.Tensor, return tensor +def get_dtype_size(dtype: torch.dtype) -> int: + """Get the size of the data type in bytes.""" + return torch.tensor([], dtype=dtype).element_size() + + def merge_dicts(dict1: Dict[Any, List[Any]], dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]: """Merge 2 dicts that have key -> List of items. 
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 2f0e59f7a..341b177d4 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,7 +6,8 @@ import torch from vllm.attention import get_attn_backend from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, is_pin_memory_available +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, + is_pin_memory_available) logger = init_logger(__name__) @@ -108,9 +109,5 @@ class CacheEngine: dtype = model_config.dtype else: dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - dtype_size = _get_dtype_size(dtype) + dtype_size = get_dtype_size(dtype) return dtype_size * total - - -def _get_dtype_size(dtype: torch.dtype) -> int: - return torch.tensor([], dtype=dtype).element_size() diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py new file mode 100644 index 000000000..5003d3b0c --- /dev/null +++ b/vllm/worker/tpu_model_runner.py @@ -0,0 +1,525 @@ +import time +from typing import List, Optional, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch_xla.core.xla_model as xm + +from vllm.attention import AttentionMetadata, get_attn_backend +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, + ParallelConfig, SchedulerConfig, VisionLanguageConfig) +from vllm.logger import init_logger +from vllm.model_executor.model_loader import get_model +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, + SamplerOutput, SequenceGroupMetadata, + SequenceOutput) +from vllm.utils import make_tensor_with_pad + +logger = init_logger(__name__) + +_PAD_SLOT_ID = 0 # FIXME(woosuk) + + +class TPUModelRunner: + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + 
device_config: DeviceConfig, + cache_config: CacheConfig, + load_config: LoadConfig, + vision_language_config: Optional[VisionLanguageConfig] = None, + ): + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.cache_config = cache_config + self.load_config = load_config + self.vision_language_config = vision_language_config + + self.block_size = self.cache_config.block_size + self.max_num_blocks_per_seq = (self.model_config.max_model_len // + self.block_size) + self.block_tables = np.zeros( + (self.scheduler_config.max_num_seqs, self.max_num_blocks_per_seq), + dtype=np.int32) + self.attn_backend = get_attn_backend( + self.model_config.get_num_attention_heads(self.parallel_config), + self.model_config.get_head_size(), + self.model_config.get_num_kv_heads(self.parallel_config), + self.model_config.get_sliding_window(), + self.model_config.dtype, + self.cache_config.cache_dtype, + self.block_size, + False, + ) + + def load_model(self) -> None: + self.device = self.device_config.device + + model = get_model( + model_config=self.model_config, + load_config=self.load_config, + device_config=self.device_config, + parallel_config=self.parallel_config, + cache_config=self.cache_config, + scheduler_config=self.scheduler_config, + vision_language_config=self.vision_language_config, + lora_config=None, + ) + xm.wait_device_ops() + + model = ModelWrapper(model) + self.model = torch.compile(model, backend="openxla", fullgraph=True) + + def _dummy_run( + self, + batch_size: int, + seq_len: int, + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + is_prompt: bool, + ) -> None: + if is_prompt: + seq_len = (seq_len + 15) // 16 * 16 + token_ids = torch.zeros((batch_size, seq_len), + dtype=torch.int32, + device=self.device) + position_ids = torch.zeros((batch_size, seq_len), + dtype=torch.int32, + device=self.device) + slot_mapping = torch.zeros((batch_size, seq_len), + 
dtype=torch.int64, + device=self.device) + attn_metadata = self.attn_backend.make_metadata( + num_prefills=batch_size, + num_prefill_tokens=batch_size * seq_len, + num_decode_tokens=0, + slot_mapping=slot_mapping, + block_tables=None, + context_lens=None, + ) + input_lens = torch.ones((batch_size, ), + dtype=torch.int32, + device=self.device) + else: + assert seq_len == 1 + token_ids = torch.zeros((batch_size, seq_len), + dtype=torch.int32, + device=self.device) + position_ids = torch.zeros((batch_size, seq_len), + dtype=torch.int32, + device=self.device) + slot_mapping = torch.zeros((batch_size, seq_len), + dtype=torch.int64, + device=self.device) + block_tables = torch.zeros( + (batch_size, self.max_num_blocks_per_seq), + dtype=torch.int32, + device=self.device) + context_lens = torch.ones((batch_size, ), + dtype=torch.int32, + device=self.device) + input_lens = torch.ones((batch_size, ), + dtype=torch.int32, + device=self.device) + attn_metadata = self.attn_backend.make_metadata( + num_prefills=0, + num_prefill_tokens=0, + num_decode_tokens=batch_size * seq_len, + slot_mapping=slot_mapping, + block_tables=block_tables, + context_lens=context_lens, + ) + t = torch.ones((batch_size, ), dtype=torch.float32, device=self.device) + p = torch.ones((batch_size, ), dtype=torch.float32, device=self.device) + + # Dummy run. 
+ self.model(token_ids, position_ids, kv_caches, attn_metadata, + input_lens, t, p) + + def warmup_model( + self, + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + ) -> None: + # Prefill + logger.info("Compiling the model with different input shapes...") + start = time.time() + for batch_size in [1]: + seq_len = 16 + while True: + self._dummy_run(batch_size, seq_len, kv_caches, is_prompt=True) + xm.wait_device_ops() + logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len) + + if seq_len >= self.model_config.max_model_len: + break + num_tokens = batch_size * seq_len + if num_tokens >= self.scheduler_config.max_num_batched_tokens: + break + seq_len = seq_len * 2 + + end = time.time() + logger.info("Compilation for prefill done in %.2f s.", end - start) + + # Decode + start = time.time() + seq_len = 1 + batch_size = 1 + while True: + self._dummy_run(batch_size, seq_len, kv_caches, is_prompt=False) + xm.wait_device_ops() + logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len) + + if batch_size >= self.scheduler_config.max_num_seqs: + break + batch_size = batch_size + 16 if batch_size >= 16 else batch_size * 2 + + end = time.time() + logger.info("Compilation for decode done in %.2f s.", end - start) + + def _prepare_prompt( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ): + assert len(seq_group_metadata_list) > 0 + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + prompt_lens: List[int] = [] + slot_mapping: List[List[int]] = [] + + for seq_group_metadata in seq_group_metadata_list: + assert seq_group_metadata.is_prompt + seq_ids = list(seq_group_metadata.seq_data.keys()) + assert len(seq_ids) == 1 + seq_id = seq_ids[0] + + seq_data = seq_group_metadata.seq_data[seq_id] + # Could include output tokens when a request is preempted. 
+ prompt_tokens = seq_data.get_token_ids() + prompt_len = len(prompt_tokens) + prompt_lens.append(prompt_len) + + input_tokens.append(prompt_tokens) + input_positions.append(list(range(prompt_len))) + + assert seq_group_metadata.block_tables is not None + block_table = seq_group_metadata.block_tables[seq_id] + slot_mapping.append([]) + for i in range(prompt_len): + block_number = block_table[i // self.block_size] + block_offset = i % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping[-1].append(slot) + + assert len(prompt_lens) > 0 + num_prefills = len(prompt_lens) + num_prefill_tokens = sum(prompt_lens) + + # Add paddings to make the shape [batch_size, max_prompt_len] where + # max_prompt_len is smallest power of 2 that is greater than or equal + # to the maximum prompt length. + # We need the 2D input shape because the Pallas FlashAttention kernel + # does not support packed 1D inputs. + # We pad the seq_len to powers of 2 to reduce the compilation overhead. + max_prompt_len = _get_padded_prefill_len(max(prompt_lens)) + input_tokens = make_tensor_with_pad(input_tokens, + max_prompt_len, + pad=0, + dtype=torch.int32, + device=self.device) + input_positions = make_tensor_with_pad(input_positions, + max_prompt_len, + pad=0, + dtype=torch.int32, + device=self.device) + slot_mapping = make_tensor_with_pad(slot_mapping, + max_prompt_len, + pad=_PAD_SLOT_ID, + dtype=torch.int64, + device=self.device) + prompt_lens = torch.tensor(prompt_lens, + dtype=torch.int32, + device=self.device) + attn_metadata = self.attn_backend.make_metadata( + num_prefills=num_prefills, + num_prefill_tokens=num_prefill_tokens, # NOTE: This is not used. 
+ num_decode_tokens=0, + slot_mapping=slot_mapping, + block_tables=None, + context_lens=None, + ) + return input_tokens, input_positions, attn_metadata, prompt_lens + + def _prepare_decode( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ): + assert len(seq_group_metadata_list) > 0 + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + slot_mapping: List[List[int]] = [] + context_lens: List[int] = [] + num_seq_groups = len(seq_group_metadata_list) + batch_size = _get_padded_batch_size(num_seq_groups) + + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + assert not seq_group_metadata.is_prompt + + seq_ids = list(seq_group_metadata.seq_data.keys()) + + for seq_id in seq_ids: + seq_data = seq_group_metadata.seq_data[seq_id] + generation_token = seq_data.get_last_token_id() + input_tokens.append([generation_token]) + + seq_len = seq_data.get_len() + position = seq_len - 1 + input_positions.append([position]) + context_lens.append(seq_len) + + assert seq_group_metadata.block_tables is not None + block_table = seq_group_metadata.block_tables[seq_id] + self.block_tables[i, :len(block_table)] = block_table + + block_number = block_table[position // self.block_size] + block_offset = position % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping.append([slot]) + + num_paddings = batch_size - num_seq_groups + input_tokens = input_tokens + [[0]] * num_paddings + input_positions = input_positions + [[0]] * num_paddings + slot_mapping = slot_mapping + [[_PAD_SLOT_ID]] * num_paddings + context_lens = context_lens + [0] * num_paddings + + input_tokens = torch.tensor(input_tokens, + dtype=torch.int32, + device=self.device) + input_positions = torch.tensor(input_positions, + dtype=torch.int32, + device=self.device) + slot_mapping = torch.tensor(slot_mapping, + dtype=torch.int64, + device=self.device) + context_lens = torch.tensor(context_lens, + dtype=torch.int32, + device=self.device) + 
block_tables = torch.tensor(self.block_tables[:batch_size], + dtype=torch.int32, + device=self.device) + input_lens = torch.tensor([1] * batch_size, + dtype=torch.int32, + device=self.device) + attn_metadata = self.attn_backend.make_metadata( + num_prefills=0, + num_prefill_tokens=0, + num_decode_tokens=batch_size, + slot_mapping=slot_mapping, + block_tables=block_tables, + context_lens=context_lens, + ) + return input_tokens, input_positions, attn_metadata, input_lens + + def _prepare_sample( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + padded_batch_size: int, + ) -> Tuple[torch.Tensor, torch.Tensor]: + assert len(seq_group_metadata_list) > 0 + t = [] + p = [] + for seq_group_metadata in seq_group_metadata_list: + assert seq_group_metadata.sampling_params is not None + sampling_params = seq_group_metadata.sampling_params + + t.append(sampling_params.temperature + if sampling_params.temperature >= 1e-5 else 1e-5) + p.append(sampling_params.top_p) + num_paddings = padded_batch_size - len(seq_group_metadata_list) + t += [1.0] * num_paddings + p += [1.0] * num_paddings + + t = torch.tensor(t, dtype=torch.float32, device=self.device) + p = torch.tensor(p, dtype=torch.float32, device=self.device) + return t, p + + def prepare_inputs( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + ): + assert seq_group_metadata_list is not None + assert len(seq_group_metadata_list) > 0 + # NOTE: We assume that all sequences in the group are all prompts or + # all decodes. 
+ if seq_group_metadata_list[0].is_prompt: + inputs = self._prepare_prompt(seq_group_metadata_list) + else: + inputs = self._prepare_decode(seq_group_metadata_list) + padded_batch_size = inputs[0].shape[0] + sample_inputs = self._prepare_sample(seq_group_metadata_list, + padded_batch_size) + return inputs + sample_inputs + + def _execute_model( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + ) -> List[CompletionSequenceGroupOutput]: + inputs = self.prepare_inputs(seq_group_metadata_list) + next_token_ids = self.model(inputs[0], inputs[1], kv_caches, + *inputs[2:]) + next_token_ids = next_token_ids.cpu().tolist() + + i = 0 + sampler_outputs = [] + for seq_group_metadata in seq_group_metadata_list: + seq_outputs = [] + seq_ids = list(seq_group_metadata.seq_data.keys()) + for seq_id in seq_ids: + next_token_id = next_token_ids[i] + seq_outputs.append( + SequenceOutput(seq_id, next_token_id, + {next_token_id: Logprob(0.0)})) + i += 1 + sampler_outputs.append( + CompletionSequenceGroupOutput(seq_outputs, None)) + return sampler_outputs + + def execute_model( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + ) -> SamplerOutput: + assert seq_group_metadata_list is not None + if seq_group_metadata_list[0].is_prompt: + # NOTE(woosuk): To reduce the compilation time, we only compile the + # prefill inputs with batch size 1. Because the scheduler is not + # aware of this limitation, we need to handle batch size > 1 + # internally by calling the model multiple times and concatenating + # the outputs. + # FIXME(woosuk): This is a temporary hack to not change the existing + # scheduler. We need to fix this in the future. 
+ sampler_outputs = [] + for seq_group_metadata in seq_group_metadata_list: + sampler_outputs += self._execute_model([seq_group_metadata], + kv_caches) + else: + sampler_outputs = self._execute_model(seq_group_metadata_list, + kv_caches) + return SamplerOutput(sampler_outputs) + + +class ModelWrapper(nn.Module): + + def __init__(self, model: nn.Module): + super().__init__() + self.model = model.eval() + + def forward( + self, + token_ids: torch.Tensor, + position_ids: torch.Tensor, + kv_caches: List[Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]], + attn_metadata: AttentionMetadata, + input_lens: torch.Tensor, + t: torch.Tensor, + p: torch.Tensor, + ) -> torch.Tensor: + """Executes the forward pass of the model and samples the next token. + + Args: + token_ids: The input token IDs of shape [batch_size, seq_len]. + position_ids: The input position IDs of shape [batch_size, seq_len]. + kv_caches: The key and value caches. They can be None during the + memory profiling at initialization. + attn_metadata: The Pallas attention metadata. + input_lens: The actual input lengths of shape [batch_size]. + t: The sampling temperature of shape [batch_size]. + p: The top-p probability of shape [batch_size]. + """ + batch_size, seq_len = token_ids.shape + # Calculate the positions to sample from. + base_indicies = torch.arange( + batch_size, dtype=torch.int32, device=input_lens.device) * seq_len + logits_indices = base_indicies + input_lens - 1 + + # FIXME(woosuk): This is a temporary hack to avoid using the existing + # sampler and sampling metadata. + sampling_metadata = SamplingMetadata( + seq_groups=[], + selected_token_indices=logits_indices, + categorized_sample_indices={}, + num_prompts=attn_metadata.num_prefills, + ) + + # Skip this in memory profiling at initialization. + if kv_caches[0][0] is not None: + # index_copy_(slot_mapping) only works when the inserted dimension + # is 0. 
However, the KV cache in the Pallas backend has the shape + # [num_kv_heads, num_blocks, block_size, head_size]. To make it + # work, we need to flatten the first three dimensions and modify + # the slot_mapping accordingly. + num_kv_heads, num_blocks, block_size, _ = kv_caches[0][0].shape + slot_mapping = attn_metadata.slot_mapping + slot_mapping = slot_mapping.flatten() + head_indicies = torch.arange(0, + num_kv_heads, + device=slot_mapping.device, + dtype=slot_mapping.dtype) + head_indicies *= block_size * num_blocks + slot_mapping = slot_mapping.repeat_interleave(num_kv_heads).view( + -1, num_kv_heads) + slot_mapping = slot_mapping + head_indicies.view(1, -1) + slot_mapping = slot_mapping.flatten() + attn_metadata.slot_mapping = slot_mapping + + hidden_states = self.model( + token_ids, + position_ids, + kv_caches, + attn_metadata, + ) + hidden_states = hidden_states.flatten(0, 1) + logits = self.model.compute_logits(hidden_states, sampling_metadata) + + logits = logits / t.unsqueeze(dim=1) + # FIXME(woosuk): Disabled top-p sampling since it's too slow. + # logits = _apply_top_p(logits, p.unsqueeze(dim=1)) + probs = torch.softmax(logits, dim=-1, dtype=torch.float32) + # FIXME(woosuk): best_of > 1 is not supported. + next_token_ids = torch.multinomial(probs, num_samples=1).squeeze(dim=1) + return next_token_ids + + +def _get_padded_prefill_len(x: int) -> int: + # NOTE(woosuk): The pallas FlashAttention kernel requires the sequence + # length to be a multiple of 16. We pad the prompt length to the next + # power of two (at least 16). This is also good for performance.
+ if x <= 16: + return 16 + return 1 << (x - 1).bit_length() + + +def _get_padded_batch_size(batch_size: int) -> int: + if batch_size <= 2: + return batch_size + elif batch_size <= 4: + return 4 + elif batch_size <= 8: + return 8 + else: + return ((batch_size + 15) // 16) * 16 + + +def _apply_top_p(logits: torch.Tensor, p: torch.Tensor) -> torch.Tensor: + logits_sorted = torch.sort(logits, dim=-1, descending=True).values + sorted_cum_probs = torch.cumsum(logits_sorted.softmax(dim=-1), dim=-1) + cutoff_index = torch.sum(sorted_cum_probs < p, dim=-1, keepdim=True) + cutoff_logit = torch.gather(logits_sorted, -1, cutoff_index) + logits = logits.masked_fill_(logits < cutoff_logit, -float("inf")) + return logits diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py new file mode 100644 index 000000000..04576015d --- /dev/null +++ b/vllm/worker/tpu_worker.py @@ -0,0 +1,198 @@ +import os +from typing import List, Optional, Tuple + +import torch +import torch_xla.core.xla_model as xm +import torch_xla.runtime as xr + +import vllm.envs as envs +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, + ParallelConfig, SchedulerConfig, VisionLanguageConfig) +from vllm.distributed import (ensure_model_parallel_initialized, + init_distributed_environment) +from vllm.logger import init_logger +from vllm.model_executor import set_random_seed +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size +from vllm.worker.tpu_model_runner import TPUModelRunner +from vllm.worker.worker_base import LoraNotSupportedWorkerBase + +logger = init_logger(__name__) + + +class TPUWorker(LoraNotSupportedWorkerBase): + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + cache_config: CacheConfig, + load_config: LoadConfig, + vision_language_config: Optional[VisionLanguageConfig], + 
local_rank: int, + rank: int, + distributed_init_method: str, + ) -> None: + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.cache_config = cache_config + self.load_config = load_config + self.vision_language_config = vision_language_config + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + + assert self.device_config.device_type == "tpu" + if self.cache_config.cache_dtype == "auto": + self.cache_dtype = self.model_config.dtype + else: + self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ + self.cache_config.cache_dtype] + + self.model_runner = TPUModelRunner(model_config, parallel_config, + scheduler_config, device_config, + cache_config, load_config, + vision_language_config) + + def init_device(self) -> None: + os.environ["PJRT_DEVICE"] = "TPU" + self.device = xm.xla_device() + self.device_config.device = self.device + torch.set_grad_enabled(False) + torch.set_default_dtype(self.model_config.dtype) + + # NOTE(woosuk): This is just a hack to initialize the TP group. + # This cannot perform the actual communication ops. + init_distributed_environment( + world_size=self.parallel_config.world_size, + rank=self.rank, + local_rank=self.local_rank, + distributed_init_method=self.distributed_init_method, + backend="gloo", + ) + ensure_model_parallel_initialized( + self.parallel_config.tensor_parallel_size, + self.parallel_config.pipeline_parallel_size) + + # Set random seed. + set_random_seed(self.model_config.seed) + xm.set_rng_state(self.model_config.seed, self.device) + + # Increase the cache size limit, which is the maximum number of + # dynamo graphs that can be compiled. + # NOTE(woosuk): Usually, we compile 10-15 graphs for prefill and + # 30-40 graphs for decode. 128 is an arbitrary safe number. + torch._dynamo.config.cache_size_limit = 128 + # Use persistent cache to avoid XLA recompilation. 
+ # NOTE(woosuk): This does not completely eliminate the recompilation + # overhead because dynamo does not cache the compiled results. + xr.initialize_cache(os.path.expanduser(envs.VLLM_XLA_CACHE_PATH), + readonly=False) + + def load_model(self): + self.model_runner.load_model() + + def determine_num_available_blocks(self) -> Tuple[int, int]: + num_layers = self.model_config.get_num_layers(self.parallel_config) + head_size = self.model_config.get_head_size() + num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) + + kv_caches = [(None, None) for _ in range(num_layers)] + self.model_runner._dummy_run( + batch_size=1, + seq_len=self.scheduler_config.max_num_batched_tokens, + kv_caches=kv_caches, + is_prompt=True, + ) + # Synchronize before measuring the memory usage. + xm.wait_device_ops() + + m = xm.get_memory_info(self.device) + program_size = 1024 * 1024 * 1024 # 1GB + free_bytes = max(m["bytes_limit"] - m["bytes_used"] - program_size, 0) + kv_cache_bytes = int(free_bytes * + self.cache_config.gpu_memory_utilization) + kv_cache_dtype_btyes = get_dtype_size(self.cache_dtype) + block_size = self.cache_config.block_size + num_tpu_blocks = (kv_cache_bytes // + (kv_cache_dtype_btyes * block_size * num_layers * 2 * + head_size * num_kv_heads)) + num_tpu_blocks = (num_tpu_blocks // 8) * 8 # Round down to 8. 
+ return num_tpu_blocks, 0 + + def initialize_cache( + self, + num_gpu_blocks: int, + num_cpu_blocks: int, + ) -> None: + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + self.block_size = self.cache_config.block_size + + dtype = self.cache_dtype + num_layers = self.model_config.get_num_layers(self.parallel_config) + num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) + head_size = self.model_config.get_head_size() + + self.tpu_cache = [] + tpu_cache_shape = self.model_runner.attn_backend.get_kv_cache_shape( + num_gpu_blocks, self.block_size, num_kv_heads, head_size) + for _ in range(num_layers): + key_cache = torch.zeros(tpu_cache_shape, + dtype=dtype, + device=self.device) + value_cache = torch.zeros_like(key_cache) + self.tpu_cache.append((key_cache, value_cache)) + self._warmup_model() + + def _warmup_model(self) -> None: + # FIXME(woosuk): Here we are abusing `enforce_eager` which is defined + # for CUDA graphs. We should refactor this part. + if not self.model_config.enforce_eager: + # Warm up the model with all possible input shapes so that + # compilation never happens during the actual execution. + # This may take ~30 mins for the first run and ~20 mins for the + # subsequent runs. + # If `enforce_eager` is True, the ahead-of-time compilation is + # skipped and the compilation happens during the actual execution, + # which is bad for performance but useful for development. 
+ self.model_runner.warmup_model(self.tpu_cache) + + def get_cache_block_size_bytes(self) -> int: + head_size = self.model_config.get_head_size() + num_heads = self.model_config.get_num_kv_heads(self.parallel_config) + num_layers = self.model_config.get_num_layers(self.parallel_config) + + key_cache_block = self.cache_config.block_size * num_heads * head_size + value_cache_block = key_cache_block + total = num_layers * (key_cache_block + value_cache_block) + dtype_size = get_dtype_size(self.cache_dtype) + return dtype_size * total + + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + if execute_model_req is None: + return [] + + seq_group_metadata_list = execute_model_req.seq_group_metadata_list + num_seq_groups = len(seq_group_metadata_list) + if num_seq_groups == 0: + return [] + + # Currently, TPUWorker does not support swapping. + # TODO(woosuk): Support block copying. + assert len(execute_model_req.blocks_to_swap_in) == 0, ( + "Swapping is not supported for the TPU backend.") + assert len(execute_model_req.blocks_to_swap_out) == 0, ( + "Swapping is not supported for the TPU backend.") + assert len(execute_model_req.blocks_to_copy) == 0 + + output = self.model_runner.execute_model(seq_group_metadata_list, + self.tpu_cache) + return [output] -- GitLab From c3c2903e72c6e85a81ff6de8b879f4c82e8ad364 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 13 Jun 2024 03:58:53 +0800 Subject: [PATCH 005/376] [Bugfix] Add device assertion to TorchSDPA (#5402) --- vllm/attention/selector.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 3f0e29c73..8b07fb2d7 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -58,6 +58,9 @@ def get_attn_backend( ROCmFlashAttentionBackend) return ROCmFlashAttentionBackend elif backend == _Backend.TORCH_SDPA: + # TODO: make XPU backend available here. 
+ assert is_cpu(), RuntimeError( + "Torch SDPA backend is only used for the CPU device.") logger.info("Using Torch SDPA backend.") from vllm.attention.backends.torch_sdpa import TorchSDPABackend return TorchSDPABackend -- GitLab From 8b82a89997826af8e0e4ecfaaed60f3b28b1baed Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 12 Jun 2024 14:00:18 -0700 Subject: [PATCH 006/376] [ci] Add AMD, Neuron, Intel tests for AWS CI and turn off default soft fail for GPU tests (#5464) Signed-off-by: kevin --- .buildkite/test-template-aws.j2 | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index 3b5d36b24..645747ddd 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -19,6 +19,34 @@ steps: limit: 5 - wait + - group: "AMD Tests" + depends_on: ~ + steps: + {% for step in steps %} + {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} + - label: "AMD: {{ step.label }}" + agents: + queue: amd + command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" + env: + DOCKER_BUILDKIT: "1" + soft_fail: true + {% endif %} + {% endfor %} + + - label: "Neuron Test" + depends_on: ~ + agents: + queue: neuron + command: bash .buildkite/run-neuron-test.sh + soft_fail: false + + - label: "Intel Test" + depends_on: ~ + agents: + queue: intel + command: bash .buildkite/run-cpu-test.sh + {% for step in steps %} - label: "{{ step.label }}" agents: @@ -31,7 +59,7 @@ steps: {% else %} queue: gpu_1_queue {% endif %} - soft_fail: true + soft_fail: {{ step.soft_fail or false }} {% if step.parallelism %} parallelism: {{ step.parallelism }} {% endif %} -- GitLab From 5985e3427dc4a10b8483fd08013fa8df563f04fb Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 12 Jun 2024 14:07:26 -0700 Subject: [PATCH 007/376] [Kernel] Vectorized FP8 quantize 
kernel (#5396) Inspired by #5146, this PR improves the FP8 quantize kernel by vectorizing data transfer to better utilize memory bandwidth. Microbenchmarks show that this improved kernel can achieve 1.0x-1.5x speedup (especially when hidden size is large). In detail, we applied 3 optimizations: - Use inverted scale so that most divisions are changed to multiplications. - Unroll the loop 4 times to improve ILP. - Use 4-element vectorized accesses to transfer data between HBM and SRAM. --- csrc/quantization/fp8/common.cu | 53 +++++++++++++++++++++++++++++---- tests/quantization/test_fp8.py | 47 +++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 6 deletions(-) diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index 8c5b693bf..6120086d7 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -23,8 +23,8 @@ __device__ __forceinline__ float atomicMaxFloat(float* addr, float value) { template __device__ __forceinline__ c10::Float8_e4m3fn scaled_fp8_conversion( - const scalar_t val, const float scale) { - float x = static_cast(val) / scale; + const scalar_t val, const float inverted_scale) { + float x = static_cast(val) * inverted_scale; float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX)); return static_cast(r); } @@ -71,15 +71,56 @@ __global__ void segmented_max_reduction(float* __restrict__ scale, } } +template +struct __align__(8) vec4_t { + scalar_t x; + scalar_t y; + scalar_t z; + scalar_t w; +}; + +typedef struct __align__(4) { + c10::Float8_e4m3fn x; + c10::Float8_e4m3fn y; + c10::Float8_e4m3fn z; + c10::Float8_e4m3fn w; +} +float8x4_t; + template __global__ void scaled_fp8_quant_kernel(c10::Float8_e4m3fn* __restrict__ out, const scalar_t* __restrict__ input, const float* __restrict__ scale, int64_t num_elems) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - while (i < num_elems) { - out[i] = scaled_fp8_conversion(input[i], *scale); - i += blockDim.x * gridDim.x; + int tid = blockDim.x * blockIdx.x +
threadIdx.x; + + // Invert the scale so that we can use multiplications to avoid expensive + // division. + const float inverted_scale = 1.0f / (*scale); + + // Vectorized input/output to better utilize memory bandwidth. + const vec4_t* vectorized_in = + reinterpret_cast*>(input); + float8x4_t* vectorized_out = reinterpret_cast(out); + + int num_vec_elems = num_elems >> 2; + +#pragma unroll 4 + for (int i = tid; i < num_vec_elems; i += blockDim.x * gridDim.x) { + vec4_t in_vec = vectorized_in[i]; + float8x4_t out_vec; + + out_vec.x = scaled_fp8_conversion(in_vec.x, inverted_scale); + out_vec.y = scaled_fp8_conversion(in_vec.y, inverted_scale); + out_vec.z = scaled_fp8_conversion(in_vec.z, inverted_scale); + out_vec.w = scaled_fp8_conversion(in_vec.w, inverted_scale); + vectorized_out[i] = out_vec; + } + + // Handle the remaining elements if num_elems is not divisible by 4 + for (int i = num_vec_elems * 4 + tid; i < num_elems; + i += blockDim.x * gridDim.x) { + out[i] = scaled_fp8_conversion(input[i], inverted_scale); } } diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index fccce7f7b..7cb65326c 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -5,6 +5,7 @@ Run `pytest tests/quantization/test_fp8.py --forked`. 
import pytest import torch +from vllm._custom_ops import scaled_fp8_quant from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod @@ -22,3 +23,49 @@ def test_load_fp16_model(vllm_runner) -> None: fc1 = model.model.decoder.layers[0].fc1 assert isinstance(fc1.quant_method, Fp8LinearMethod) assert fc1.weight.dtype == torch.float8_e4m3fn + + +@pytest.mark.skipif( + capability < QUANTIZATION_METHODS["fp8"].get_min_capability(), + reason="FP8 is not supported on this GPU type.") +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +def test_scaled_fp8_quant(dtype) -> None: + + def quantize_ref(tensor, inv_scale): + # The reference implementation that fully aligns to + # the kernel being tested. + finfo = torch.finfo(torch.float8_e4m3fn) + scale = inv_scale.reciprocal() + qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, + max=finfo.max) + qweight = qweight.to(torch.float8_e4m3fn) + return qweight + + def per_tensor_dequantize(tensor, inv_scale, dtype): + fake_qweight = tensor.to(dtype) + dq_weight = fake_qweight * inv_scale + return dq_weight + + # Note that we use a shape % 4 != 0 to cover edge cases, + # because scaled_fp8_quant is vectorized by 4. 
+ x = (torch.randn(size=(11, 11), device="cuda") * 13).to(dtype) + + # Dynamic quantization + ref_y, inv_scale = scaled_fp8_quant(x, None) + ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype) + + # Reference dynamic quantization + y = quantize_ref(x, inv_scale) + assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype)) + + # Static quantization + y, _ = scaled_fp8_quant(x, inv_scale) + assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype)) + + # Padding + y, _ = scaled_fp8_quant(x, inv_scale, batch_dim_padding=17) + assert y.shape[0] == 17 + assert torch.allclose( + ref_y, + per_tensor_dequantize(torch.narrow(y, 0, 0, x.shape[0]), inv_scale, + dtype)) -- GitLab From 5cc50a531f720758025c8493ee85a56272277a54 Mon Sep 17 00:00:00 2001 From: Arthur Kim Date: Thu, 13 Jun 2024 06:08:52 +0900 Subject: [PATCH 008/376] [Bugfix] TYPE_CHECKING for MultiModalData (#5444) --- vllm/inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/inputs.py b/vllm/inputs.py index 85c9cd84f..026903e19 100644 --- a/vllm/inputs.py +++ b/vllm/inputs.py @@ -4,7 +4,7 @@ from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence, from typing_extensions import NotRequired if TYPE_CHECKING: - from vllm.sequence import MultiModalData + from vllm.multimodal import MultiModalData class ParsedText(TypedDict): -- GitLab From 51602eefd38250325e541abd28f051ffd7676c3f Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Wed, 12 Jun 2024 15:13:52 -0600 Subject: [PATCH 009/376] [Frontend] [Core] Support for sharded tensorized models (#4990) Signed-off-by: Travis Johnson Co-authored-by: Sanger Steel Co-authored-by: Roger Wang --- examples/tensorize_vllm_model.py | 125 +++++++++--------- tests/tensorizer_loader/test_tensorizer.py | 99 ++++++++++++-- vllm/model_executor/model_loader/loader.py | 18 ++- .../model_executor/model_loader/tensorizer.py | 107 ++++++++++----- vllm/worker/model_runner.py | 11 ++ vllm/worker/worker.py | 8 ++ 6 files
changed, 261 insertions(+), 107 deletions(-) diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py index 8b74ae1d7..f9ed5fe08 100644 --- a/examples/tensorize_vllm_model.py +++ b/examples/tensorize_vllm_model.py @@ -3,18 +3,12 @@ import dataclasses import json import os import uuid -from functools import partial - -from tensorizer import stream_io from vllm import LLM -from vllm.distributed import (init_distributed_environment, - initialize_model_parallel) from vllm.engine.arg_utils import EngineArgs -from vllm.engine.llm_engine import LLMEngine from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs, TensorizerConfig, - serialize_vllm_model) + tensorize_vllm_model) # yapf conflicts with isort for this docstring # yapf: disable @@ -61,6 +55,12 @@ Which downloads the model tensors from your S3 bucket and deserializes them. You can also provide a `--keyfile` argument to decrypt the model weights if they were serialized with encryption. +To support distributed tensor-parallel models, each model shard will be +serialized to a separate file. The tensorizer_uri is then specified as a string +template with a format specifier such as '%03d' that will be rendered with the +shard's rank. Sharded models serialized with this script will be named as +model-rank-%03d.tensors + For more information on the available arguments for serializing, run `python -m examples.tensorize_vllm_model serialize --help`. 
@@ -168,77 +168,72 @@ def parse_args(): def deserialize(): llm = LLM(model=args.model, load_format="tensorizer", + tensor_parallel_size=args.tensor_parallel_size, model_loader_extra_config=tensorizer_config ) return llm +if __name__ == '__main__': + args = parse_args() -args = parse_args() - -s3_access_key_id = (getattr(args, 's3_access_key_id', None) - or os.environ.get("S3_ACCESS_KEY_ID", None)) -s3_secret_access_key = (getattr(args, 's3_secret_access_key', None) - or os.environ.get("S3_SECRET_ACCESS_KEY", None)) -s3_endpoint = (getattr(args, 's3_endpoint', None) - or os.environ.get("S3_ENDPOINT_URL", None)) - -credentials = { - "s3_access_key_id": s3_access_key_id, - "s3_secret_access_key": s3_secret_access_key, - "s3_endpoint": s3_endpoint -} + s3_access_key_id = (getattr(args, 's3_access_key_id', None) + or os.environ.get("S3_ACCESS_KEY_ID", None)) + s3_secret_access_key = (getattr(args, 's3_secret_access_key', None) + or os.environ.get("S3_SECRET_ACCESS_KEY", None)) + s3_endpoint = (getattr(args, 's3_endpoint', None) + or os.environ.get("S3_ENDPOINT_URL", None)) -_read_stream, _write_stream = (partial( - stream_io.open_stream, - mode=mode, - s3_access_key_id=s3_access_key_id, - s3_secret_access_key=s3_secret_access_key, - s3_endpoint=s3_endpoint, -) for mode in ("rb", "wb+")) + credentials = { + "s3_access_key_id": s3_access_key_id, + "s3_secret_access_key": s3_secret_access_key, + "s3_endpoint": s3_endpoint + } -model_ref = args.model + model_ref = args.model -model_name = model_ref.split("/")[1] + model_name = model_ref.split("/")[1] -os.environ["MASTER_ADDR"] = "127.0.0.1" -os.environ["MASTER_PORT"] = "8080" + keyfile = args.keyfile if args.keyfile else None -init_distributed_environment(world_size=1, rank=0, local_rank=0) -initialize_model_parallel() + if args.model_loader_extra_config: + config = json.loads(args.model_loader_extra_config) + tensorizer_args = \ + TensorizerConfig(**config)._construct_tensorizer_args() + tensorizer_args.tensorizer_uri = 
args.path_to_tensors + else: + tensorizer_args = None -keyfile = args.keyfile if args.keyfile else None + if args.command == "serialize": + eng_args_dict = {f.name: getattr(args, f.name) for f in + dataclasses.fields(EngineArgs)} + engine_args = EngineArgs.from_cli_args( + argparse.Namespace(**eng_args_dict) + ) -if args.model_loader_extra_config: - config = json.loads(args.model_loader_extra_config) - tensorizer_args = TensorizerConfig(**config)._construct_tensorizer_args() - tensorizer_args.tensorizer_uri = args.path_to_tensors -else: - tensorizer_args = None - -if args.command == "serialize": - eng_args_dict = {f.name: getattr(args, f.name) for f in - dataclasses.fields(EngineArgs)} - - engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict)) - engine = LLMEngine.from_engine_args(engine_args) + input_dir = args.serialized_directory.rstrip('/') + suffix = args.suffix if args.suffix else uuid.uuid4().hex + base_path = f"{input_dir}/vllm/{model_ref}/{suffix}" + if engine_args.tensor_parallel_size > 1: + model_path = f"{base_path}/model-rank-%03d.tensors" + else: + model_path = f"{base_path}/model.tensors" - input_dir = args.serialized_directory.rstrip('/') - suffix = args.suffix if args.suffix else uuid.uuid4().hex - base_path = f"{input_dir}/vllm/{model_ref}/{suffix}" - model_path = f"{base_path}/model.tensors" - tensorizer_config = TensorizerConfig( - tensorizer_uri=model_path, - **credentials) - serialize_vllm_model(engine, tensorizer_config, keyfile) -elif args.command == "deserialize": - if not tensorizer_args: tensorizer_config = TensorizerConfig( - tensorizer_uri=args.path_to_tensors, - encryption_keyfile = keyfile, - **credentials - ) - deserialize() -else: - raise ValueError("Either serialize or deserialize must be specified.") + tensorizer_uri=model_path, + encryption_keyfile=keyfile, + **credentials) + + tensorize_vllm_model(engine_args, tensorizer_config) + + elif args.command == "deserialize": + if not tensorizer_args: + 
tensorizer_config = TensorizerConfig( + tensorizer_uri=args.path_to_tensors, + encryption_keyfile = keyfile, + **credentials + ) + deserialize() + else: + raise ValueError("Either serialize or deserialize must be specified.") diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 3f2017452..9656cf5f4 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -1,21 +1,27 @@ import json import os +import pathlib import subprocess from unittest.mock import MagicMock, patch import openai import pytest import ray +import torch +from tensorizer import EncryptionParams from vllm import SamplingParams +from vllm.engine.arg_utils import EngineArgs # yapf: disable from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, TensorSerializer, is_vllm_tensorized, load_with_tensorizer, open_stream, - serialize_vllm_model) + serialize_vllm_model, + tensorize_vllm_model) +from ..conftest import VllmRunner, cleanup from ..utils import ServerRunner # yapf conflicts with isort for this docstring @@ -42,6 +48,20 @@ def is_curl_installed(): except (subprocess.CalledProcessError, FileNotFoundError): return False +def get_torch_model(vllm_runner: VllmRunner): + return vllm_runner \ + .model \ + .llm_engine \ + .model_executor \ + .driver_worker \ + .model_runner \ + .model + +def write_keyfile(keyfile_path: str): + encryption_params = EncryptionParams.random() + pathlib.Path(keyfile_path).parent.mkdir(parents=True, exist_ok=True) + with open(keyfile_path, 'wb') as f: + f.write(encryption_params.key) @pytest.fixture(autouse=True) def tensorizer_config(): @@ -88,12 +108,17 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( with vllm_runner(model_ref) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") key_path = tmp_path / (model_ref + ".key") + write_keyfile(key_path) + outputs = vllm_model.generate(prompts, sampling_params) - config_for_serializing 
= TensorizerConfig(tensorizer_uri=model_path) - serialize_vllm_model(vllm_model.model.llm_engine, - config_for_serializing, - encryption_key_path=key_path) + config_for_serializing = TensorizerConfig( + tensorizer_uri=model_path, + encryption_keyfile=key_path + ) + serialize_vllm_model(get_torch_model(vllm_model), + config_for_serializing) + config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) @@ -145,7 +170,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): with vllm_runner(model_ref, ) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(vllm_model.model.llm_engine, + serialize_vllm_model(get_torch_model(vllm_model), TensorizerConfig(tensorizer_uri=model_path)) with vllm_runner( @@ -180,7 +205,7 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): with vllm_runner(model_ref, ) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(vllm_model.model.llm_engine, + serialize_vllm_model(get_torch_model(vllm_model), TensorizerConfig(tensorizer_uri=model_path)) model_loader_extra_config = { @@ -224,7 +249,9 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner): model_loader_extra_config=TensorizerConfig(tensorizer_uri="test")) -def test_tensorizer_with_tp(vllm_runner): +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Requires 2 GPUs") +def test_tensorizer_with_tp_path_without_template(vllm_runner): with pytest.raises(ValueError): model_ref = "EleutherAI/pythia-1.4b" tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors" @@ -238,8 +265,62 @@ def test_tensorizer_with_tp(vllm_runner): s3_endpoint="object.ord1.coreweave.com", ), tensor_parallel_size=2, + disable_custom_all_reduce=True, ) +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Requires 2 GPUs") +def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, + tmp_path): + model_ref = 
"EleutherAI/pythia-1.4b" + # record outputs from un-sharded un-tensorized model + base_model = vllm_runner( + model_ref, + disable_custom_all_reduce=True, + enforce_eager=True, + ) + outputs = base_model.generate(prompts, sampling_params) + + base_model.model.llm_engine.model_executor.shutdown() + del base_model + cleanup() + ray.shutdown() + + # load model with two shards and serialize with encryption + model_path = str(tmp_path / (model_ref + "-%02d.tensors")) + key_path = tmp_path / (model_ref + ".key") + + tensorizer_config = TensorizerConfig( + tensorizer_uri=model_path, + encryption_keyfile=key_path, + ) + + tensorize_vllm_model( + engine_args=EngineArgs( + model=model_ref, + tensor_parallel_size=2, + disable_custom_all_reduce=True, + enforce_eager=True, + ), + tensorizer_config=tensorizer_config, + ) + assert os.path.isfile(model_path % 0), "Serialization subprocess failed" + assert os.path.isfile(model_path % 1), "Serialization subprocess failed" + cleanup() + ray.shutdown() + + loaded_vllm_model = vllm_runner( + model_ref, + tensor_parallel_size=2, + load_format="tensorizer", + disable_custom_all_reduce=True, + enforce_eager=True, + model_loader_extra_config=tensorizer_config) + + deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) + + assert outputs == deserialized_outputs + def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): model_ref = "facebook/opt-125m" @@ -248,7 +329,7 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): with vllm_runner(model_ref) as vllm_model: outputs = vllm_model.generate(prompts, sampling_params) - serialize_vllm_model(vllm_model.model.llm_engine, config) + serialize_vllm_model(get_torch_model(vllm_model), config) assert is_vllm_tensorized(config) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index f4c3dcbac..06de2fcc1 100644 --- a/vllm/model_executor/model_loader/loader.py +++ 
b/vllm/model_executor/model_loader/loader.py @@ -24,7 +24,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.model_loader.tensorizer import ( TensorizerConfig, is_vllm_tensorized, load_with_tensorizer, - tensorizer_weights_iterator) + serialize_vllm_model, tensorizer_weights_iterator) from vllm.model_executor.model_loader.utils import (get_model_architecture, set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( @@ -392,6 +392,12 @@ class TensorizerLoader(BaseModelLoader): cache_config: CacheConfig) -> nn.Module: self._verify_config(model_config, parallel_config) + if parallel_config.tensor_parallel_size > 1: + from vllm.distributed import get_tensor_model_parallel_rank + self.tensorizer_config.tensorizer_uri = \ + self.tensorizer_config.tensorizer_uri \ + % get_tensor_model_parallel_rank() + if is_vllm_tensorized(self.tensorizer_config): return self._load_model_serialized(model_config, device_config, lora_config, @@ -402,6 +408,16 @@ class TensorizerLoader(BaseModelLoader): vision_language_config, cache_config) + @staticmethod + def save_model( + model: torch.nn.Module, + tensorizer_config: TensorizerConfig, + ) -> None: + serialize_vllm_model( + model=model, + tensorizer_config=tensorizer_config, + ) + class ShardedStateLoader(BaseModelLoader): """ diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 2cf4ce5f8..d79fedaea 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -2,11 +2,11 @@ import argparse import dataclasses import io import os +import re import time -import typing from dataclasses import dataclass from functools import partial -from typing import Generator, Optional, Tuple, Type, Union +from typing import BinaryIO, Generator, Optional, Tuple, Type, Union import torch from torch import nn @@ -14,6 +14,7 @@ from transformers import 
PretrainedConfig import vllm.envs as envs from vllm.config import ModelConfig, ParallelConfig +from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( @@ -48,8 +49,7 @@ logger = init_logger(__name__) @dataclass class TensorizerConfig: - tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, typing.BinaryIO, - str, bytes, os.PathLike, int] + tensorizer_uri: str vllm_tensorized: Optional[bool] = False verify_hash: Optional[bool] = False num_readers: Optional[int] = None @@ -60,6 +60,12 @@ class TensorizerConfig: model_class: Optional[Type[torch.nn.Module]] = None hf_config: Optional[PretrainedConfig] = None dtype: Optional[Union[str, torch.dtype]] = None + _is_sharded: bool = False + + def __post_init__(self): + # check if the configuration is for a sharded vLLM model + self._is_sharded = isinstance(self.tensorizer_uri, str) \ + and re.search(r'%0\dd', self.tensorizer_uri) is not None def _construct_tensorizer_args(self) -> "TensorizerArgs": tensorizer_args = { @@ -78,13 +84,12 @@ class TensorizerConfig: self, parallel_config: "ParallelConfig", ) -> None: - if (parallel_config.tensor_parallel_size > 1 - and self.tensorizer_uri is not None): + if parallel_config.tensor_parallel_size > 1 \ + and not self._is_sharded: raise ValueError( - "Loading to multiple GPUs is not currently supported with " - "vLLM-serialized models. Please set tensor_parallel_size=1." 
- " or use a non-vLLM-serialized model, such as a " - "serialized Hugging Face `PretrainedModel`.") + "For a sharded model, tensorizer_uri should include a" + " string format template like '%04d' to be formatted" + " with the rank of the shard") def verify_with_model_config(self, model_config: "ModelConfig") -> None: if (model_config.quantization is not None @@ -102,8 +107,8 @@ def load_with_tensorizer(tensorizer_config: TensorizerConfig, @dataclass class TensorizerArgs: - tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, typing.BinaryIO, - str, bytes, os.PathLike, int] + tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, BinaryIO, str, + bytes, os.PathLike, int] vllm_tensorized: Optional[bool] = False verify_hash: Optional[bool] = False num_readers: Optional[int] = None @@ -332,6 +337,7 @@ class TensorizerAgent: ) as stream, TensorDeserializer( stream, dtype=self.tensorizer_config.dtype, + device=f'cuda:{torch.cuda.current_device()}', **self.tensorizer_args.deserializer_params) as deserializer: deserializer.load_into_module(self.model) end = time.perf_counter() @@ -400,33 +406,70 @@ def is_vllm_tensorized(tensorizer_config: "TensorizerConfig") -> bool: return False -def get_pretensorized_vllm_model(engine: "LLMEngine") -> nn.Module: - model = (engine.model_executor.driver_worker.model_runner.model) +def serialize_vllm_model( + model: nn.Module, + tensorizer_config: TensorizerConfig, +) -> nn.Module: model.register_parameter( "vllm_tensorized_marker", nn.Parameter(torch.tensor((1, ), device="meta"), requires_grad=False)) - return model - - -def serialize_vllm_model(engine: "LLMEngine", - tensorizer_config : TensorizerConfig, - encryption_key_path: Optional[str] = None) \ - -> nn.Module: - - model = get_pretensorized_vllm_model(engine) tensorizer_args = tensorizer_config._construct_tensorizer_args() + encryption_params = None - if encryption_key_path is not None: - encryption_params = EncryptionParams.random() - with _write_stream(encryption_key_path, - 
**tensorizer_args.stream_params) as stream: - stream.write(encryption_params.key) + if (keyfile := tensorizer_config.encryption_keyfile) is not None: + with open(keyfile, "rb") as f: + key = f.read() + encryption_params = EncryptionParams(key=key) - with _write_stream(tensorizer_args.tensorizer_uri, - **tensorizer_args.stream_params) as stream: + output_file = tensorizer_args.tensorizer_uri + if tensorizer_config._is_sharded: + from vllm.distributed import get_tensor_model_parallel_rank + output_file = output_file % get_tensor_model_parallel_rank() + + with _write_stream(output_file, **tensorizer_args.stream_params) as stream: serializer = TensorSerializer(stream, encryption=encryption_params) serializer.write_module(model) serializer.close() - logger.info("Successfully serialized model to %s", - str(tensorizer_args.tensorizer_uri)) + logger.info("Successfully serialized model to %s", str(output_file)) return model + + +def tensorize_vllm_model(engine_args: EngineArgs, + tensorizer_config: TensorizerConfig, + generate_keyfile: bool = True): + """Utility to load a model and then serialize it with Tensorizer + + Intended to be used separately from running a vLLM server since it + creates its own Engine instance. 
+ """ + engine_config = engine_args.create_engine_config() + tensorizer_config.verify_with_model_config(engine_config.model_config) + tensorizer_config.verify_with_parallel_config( + engine_config.parallel_config) + + # generate the encryption key before creating the engine to support sharding + if generate_keyfile and (keyfile := + tensorizer_config.encryption_keyfile) is not None: + encryption_params = EncryptionParams.random() + with _write_stream( + keyfile, + s3_access_key_id=tensorizer_config.s3_access_key_id, + s3_secret_access_key=tensorizer_config.s3_secret_access_key, + s3_endpoint=tensorizer_config.s3_endpoint, + ) as stream: + stream.write(encryption_params.key) + + engine = LLMEngine.from_engine_args(engine_args) + if tensorizer_config._is_sharded: + # if the engine is a distributed engine (for tensor parallel) then each + # worker shard needs to serialize its part of the model. + engine.model_executor._run_workers( + "save_tensorized_model", + tensorizer_config=tensorizer_config, + ) + else: + # with a single worker, we can get to the underlying model directly + serialize_vllm_model( + engine.model_executor.driver_worker.model_runner.model, + tensorizer_config, + ) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 99b12293a..de616ef1d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -20,6 +20,7 @@ from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sampling_params import SamplingParams from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata @@ -222,6 +223,16 @@ class ModelRunner: max_size=max_size, ) + def save_tensorized_model( + self, + tensorizer_config: TensorizerConfig, + ) -> None: + 
from vllm.model_executor.model_loader.loader import TensorizerLoader + TensorizerLoader.save_model( + self.model, + tensorizer_config=tensorizer_config, + ) + def get_max_block_per_batch(self) -> int: block_size = self.block_size return (self.max_seq_len_to_capture + block_size - 1) // block_size diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 10411a2bf..7a378a862 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -15,6 +15,7 @@ from vllm.distributed import (broadcast_tensor_dict, set_custom_all_reduce) from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.sequence import ExecuteModelRequest, PoolerOutput, SamplerOutput from vllm.worker.cache_engine import CacheEngine from vllm.worker.embedding_model_runner import EmbeddingModelRunner @@ -132,6 +133,13 @@ class Worker(WorkerBase): max_size=max_size, ) + def save_tensorized_model( + self, + tensorizer_config: TensorizerConfig, + ) -> None: + self.model_runner.save_tensorized_model( + tensorizer_config=tensorizer_config, ) + @torch.inference_mode() def determine_num_available_blocks(self) -> Tuple[int, int]: """Profiles the peak memory usage of the model to determine how many -- GitLab From 622d45128c02e5296e1177481c65199754eab396 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 12 Jun 2024 14:46:35 -0700 Subject: [PATCH 010/376] [misc] add hint for AttributeError (#5462) --- vllm/_custom_ops.py | 48 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 440b0e8af..955086be1 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1,13 +1,16 @@ import contextlib +import functools from typing import List, Optional, Tuple, Type import torch +from vllm.logger import init_logger + +logger = init_logger(__name__) + try: import vllm._C except ImportError as e: - 
from vllm.logger import init_logger - logger = init_logger(__name__) logger.warning("Failed to import from vllm._C with %r", e) with contextlib.suppress(ImportError): @@ -23,6 +26,25 @@ def is_custom_op_supported(op_name: str) -> bool: return op is not None +def hint_on_error(fn): + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + try: + return fn(*args, **kwargs) + except AttributeError as e: + msg = ( + "Error in calling custom op %s: %s\n" + "Possibly you have built or installed an obsolete version of vllm.\n" + "Please try a clean build and install of vllm, " + "or remove old built files such as vllm/*cpython*.so and build/ ." + ) + logger.error(msg, fn.__name__, e) + raise e + + return wrapper + + # activation ops def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: torch.ops._C.silu_and_mul(out, x) @@ -459,3 +481,25 @@ def dispatch_bgmv_low_level( h_out, y_offset, ) + + +# temporary fix for https://github.com/vllm-project/vllm/issues/5456 +# TODO: remove this in v0.6.0 +names_and_values = globals() +names_and_values_to_update = {} +# prepare variables to avoid dict size change during iteration +k, v, arg = None, None, None +fn_type = type(lambda x: x) +for k, v in names_and_values.items(): + # find functions that are defined in this file and have torch.Tensor + # in their annotations. `arg == "torch.Tensor"` is used to handle + # the case when users use `from __future__ import annotations` to turn type + # hints into strings. 
+ if isinstance(v, fn_type) \ + and v.__code__.co_filename == __file__ \ + and any(arg is torch.Tensor or arg == "torch.Tensor" + for arg in v.__annotations__.values()): + names_and_values_to_update[k] = hint_on_error(v) + +names_and_values.update(names_and_values_to_update) +del names_and_values_to_update, names_and_values, v, k, fn_type -- GitLab From b8d4dfff9c29ad6e02bce1fc79c089120b2d34d6 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 13 Jun 2024 05:49:31 +0800 Subject: [PATCH 011/376] [Doc] Update debug docs (#5438) --- docs/source/getting_started/debugging.rst | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 3e4d0362e..ff37f4e62 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -8,27 +8,30 @@ Debugging hang/crash issues When an vLLM instance hangs or crashes, it is very difficult to debug the issue. But wait a minute, it is also possible that vLLM is doing something that indeed takes a long time: -- Downloading a model: do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface cli `_ and then use the local path to the model. This way, you can isolate the issue. -- Loading the model from disk: if the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. 
When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory. -- Tensor parallel inference: if the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `the provided script `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +- **Downloading a model**: Do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface-cli `_ and then use the local path to the model. This way, you can isolate the issue. +- **Loading the model from disk**: If the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory. +- **Tensor parallel inference**: If the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. 
In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `the provided script `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. -If you already take care of the above issues, and the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue: +If you have already taken care of the above issues, but the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue: - Set the environment variable ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging. - Set the environment variable ``export CUDA_LAUNCH_BLOCKING=1`` to know exactly which CUDA kernel is causing the trouble. - Set the environment variable ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL. -- Set the environment variable ``export VLLM_TRACE_FUNCTION=1`` . All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs. **Note: it will generate a lot of logs and slow down the system. Only use it for debugging purposes.** +- Set the environment variable ``export VLLM_TRACE_FUNCTION=1``. All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs. + + .. warning:: + vLLM function tracing will generate a lot of logs and slow down the system. Only use it for debugging purposes. With more logging, hopefully you can find the root cause of the issue. Here are some common issues that can cause hangs: -- The network setup is incorrect. 
The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. -- Hardware/driver setup is incorrect. GPU communication cannot be established. You can run a sanity check script below to see if the GPU communication is working correctly. +- **Incorrect network setup**: The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. +- **Incorrect hardware/driver**: GPU communication cannot be established. You can run the following sanity check script to see if the GPU communication is working correctly. .. code-block:: python - # save it as `test.py`` , and run it with `NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py` + # save it as `test.py` , and run it with `NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py` # adjust `--nproc-per-node` to the number of GPUs you want to use. import torch import torch.distributed as dist @@ -39,4 +42,4 @@ Here are some common issues that can cause hangs: value = data.mean().item() assert value == dist.get_world_size() -If the problem persists, feel free to open an `issue `_ on GitHub, with a detailed description of the issue, your environment, and the logs. +If the problem persists, feel free to `open an issue on GitHub `_, with a detailed description of the issue, your environment, and the logs. 
-- GitLab From 94a07bbdd813a0121d01a852ab03fb2430e73548 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 12 Jun 2024 17:59:44 -0400 Subject: [PATCH 012/376] [Bugfix] Fix typo in scheduler.py (requeset -> request) (#5470) --- vllm/core/scheduler.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index bb37c5f31..48c34625c 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -50,8 +50,8 @@ class SchedulingBudget: """ token_budget: int max_num_seqs: int - _requeset_ids_num_batched_tokens: Set[str] = field(default_factory=set) - _requeset_ids_num_curr_seqs: Set[str] = field(default_factory=set) + _request_ids_num_batched_tokens: Set[str] = field(default_factory=set) + _request_ids_num_curr_seqs: Set[str] = field(default_factory=set) _num_batched_tokens: int = 0 _num_curr_seqs: int = 0 @@ -65,28 +65,28 @@ class SchedulingBudget: return self.token_budget - self.num_batched_tokens def add_num_batched_tokens(self, req_id: str, num_batched_tokens: int): - if req_id in self._requeset_ids_num_batched_tokens: + if req_id in self._request_ids_num_batched_tokens: return - self._requeset_ids_num_batched_tokens.add(req_id) + self._request_ids_num_batched_tokens.add(req_id) self._num_batched_tokens += num_batched_tokens def subtract_num_batched_tokens(self, req_id: str, num_batched_tokens: int): - if req_id in self._requeset_ids_num_batched_tokens: - self._requeset_ids_num_batched_tokens.remove(req_id) + if req_id in self._request_ids_num_batched_tokens: + self._request_ids_num_batched_tokens.remove(req_id) self._num_batched_tokens -= num_batched_tokens def add_num_seqs(self, req_id: str, num_curr_seqs: int): - if req_id in self._requeset_ids_num_curr_seqs: + if req_id in self._request_ids_num_curr_seqs: return - self._requeset_ids_num_curr_seqs.add(req_id) + self._request_ids_num_curr_seqs.add(req_id) self._num_curr_seqs += num_curr_seqs def subtract_num_seqs(self, req_id: 
str, num_curr_seqs: int): - if req_id in self._requeset_ids_num_curr_seqs: - self._requeset_ids_num_curr_seqs.remove(req_id) + if req_id in self._request_ids_num_curr_seqs: + self._request_ids_num_curr_seqs.remove(req_id) self._num_curr_seqs -= num_curr_seqs @property -- GitLab From 7d19de2e9c9a94658c36b55011b803a7991d0335 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 12 Jun 2024 18:42:12 -0400 Subject: [PATCH 013/376] [Frontend] Add "input speed" to tqdm postfix alongside output speed (#5425) --- vllm/entrypoints/llm.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 411d5256b..9e9234931 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -545,11 +545,13 @@ class LLM: total=num_requests, desc="Processed prompts", dynamic_ncols=True, - postfix=f"Generation Speed: {0:.2f} toks/s", + postfix=(f"est. speed input: {0:.2f} toks/s, " + f"output: {0:.2f} toks/s"), ) # Run the engine. outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = [] - total_toks = 0 + total_in_toks = 0 + total_out_toks = 0 while self.llm_engine.has_unfinished_requests(): step_outputs = self.llm_engine.step() for output in step_outputs: @@ -558,10 +560,15 @@ class LLM: if use_tqdm: if isinstance(output, RequestOutput): # Calculate tokens only for RequestOutput - total_toks += sum( + total_in_toks += len(output.prompt_token_ids) + in_spd = total_in_toks / pbar.format_dict["elapsed"] + total_out_toks += sum( len(stp.token_ids) for stp in output.outputs) - spd = total_toks / pbar.format_dict["elapsed"] - pbar.postfix = f"Generation Speed: {spd:.2f} toks/s" + out_spd = total_out_toks / pbar.format_dict[ + "elapsed"] + pbar.postfix = ( + f"est. 
speed input: {in_spd:.2f} toks/s, " + f"output: {out_spd:.2f} toks/s") pbar.update(1) if use_tqdm: pbar.close() -- GitLab From 2135cacb457b7daf1143c8465ab72650eaa4dd7e Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 13 Jun 2024 07:20:18 +0800 Subject: [PATCH 014/376] [Bugfix] Fix wrong multi_modal_input format for CPU runner (#5451) --- vllm/worker/cpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index eaf43247d..d539f5693 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -343,8 +343,8 @@ class CPUModelRunner: "kv_caches": kv_caches, "attn_metadata": attn_metadata, } - if self.vision_language_config: - execute_model_kwargs.update({"image_input": multi_modal_input}) + if self.vision_language_config and multi_modal_input is not None: + execute_model_kwargs.update(multi_modal_input) hidden_states = model_executable(**execute_model_kwargs) -- GitLab From ea3890a5f0314e49d69afca45fe706504cb14029 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 12 Jun 2024 17:27:08 -0700 Subject: [PATCH 015/376] [Core][Distributed] code deduplication in tp&pp with coordinator(#5293) [Core][Distributed] add coordinator to reduce code duplication in tp and pp (#5293) --- tests/conftest.py | 4 +- tests/distributed/test_custom_all_reduce.py | 6 +- tests/distributed/test_pynccl.py | 12 +- tests/lora/conftest.py | 23 +- tests/worker/test_model_runner.py | 4 +- vllm/attention/backends/pallas.py | 2 +- vllm/distributed/communication_op.py | 311 +------ .../device_communicators/custom_all_reduce.py | 13 +- .../custom_all_reduce_utils.py | 7 +- .../device_communicators/pynccl.py | 11 +- vllm/distributed/parallel_state.py | 809 ++++++++++++------ vllm/worker/model_runner.py | 2 +- 12 files changed, 622 insertions(+), 582 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e0680467d..29a4f126f 100644 --- 
a/tests/conftest.py +++ b/tests/conftest.py @@ -15,7 +15,8 @@ from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq, from vllm import LLM, SamplingParams from vllm.config import TokenizerPoolConfig, VisionLanguageConfig -from vllm.distributed import destroy_model_parallel +from vllm.distributed import (destroy_distributed_environment, + destroy_model_parallel) from vllm.inputs import TextPrompt from vllm.logger import init_logger from vllm.multimodal import MultiModalData @@ -54,6 +55,7 @@ def _read_prompts(filename: str) -> List[str]: def cleanup(): destroy_model_parallel() + destroy_distributed_environment() with contextlib.suppress(AssertionError): torch.distributed.destroy_process_group() gc.collect() diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 186f9faa6..3776c1f91 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -7,9 +7,9 @@ import torch import torch.distributed as dist from vllm.distributed.communication_op import ( # noqa - graph_capture, tensor_model_parallel_all_reduce) + tensor_model_parallel_all_reduce) from vllm.distributed.parallel_state import (get_tensor_model_parallel_group, - get_tp_ca_communicator) + get_tp_group, graph_capture) from ..utils import (init_test_distributed_environment, multi_process_tensor_parallel) @@ -91,7 +91,7 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port): # communicate independently num_communication = rank // tp_size + 1 sz = 1024 - fa = get_tp_ca_communicator() + fa = get_tp_group().ca_comm inp = torch.ones(sz, dtype=torch.float32, device=device) out = inp for _ in range(num_communication): diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 0218295a3..b788e253a 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -6,10 +6,11 @@ import torch import torch.distributed from 
vllm.distributed.communication_op import ( # noqa - graph_capture, tensor_model_parallel_all_reduce) + tensor_model_parallel_all_reduce) from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, + get_world_group, graph_capture, init_distributed_environment) from vllm.utils import update_environment_variables @@ -53,7 +54,8 @@ def worker_fn_wrapper(fn): @worker_fn_wrapper def worker_fn(): - pynccl_comm = PyNcclCommunicator() + pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, + device=get_world_group().device) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank) with pynccl_comm.change_state(enable=True): @@ -129,7 +131,8 @@ def test_pynccl_multiple_allreduce_with_vllm(): def worker_fn_with_cudagraph(): with torch.no_grad(): graph = torch.cuda.CUDAGraph() - pynccl_comm = PyNcclCommunicator() + pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, + device=get_world_group().device) # run something in the default stream to initialize torch engine a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}') torch.cuda.synchronize() @@ -154,7 +157,8 @@ def test_pynccl_with_cudagraph(): @worker_fn_wrapper def send_recv_worker_fn(): - pynccl_comm = PyNcclCommunicator() + pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, + device=get_world_group().device) if pynccl_comm.rank == 0: tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 400333066..522c635b8 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -12,7 +12,10 @@ from huggingface_hub import snapshot_download import vllm from vllm.config import LoRAConfig -from vllm.distributed import destroy_model_parallel, initialize_model_parallel +from vllm.distributed import 
(destroy_distributed_environment, + destroy_model_parallel, + init_distributed_environment, + initialize_model_parallel) from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, RowParallelLinear) @@ -35,6 +38,7 @@ LONG_LORA_INFOS = [{ def cleanup(): destroy_model_parallel() + destroy_distributed_environment() with contextlib.suppress(AssertionError): torch.distributed.destroy_process_group() gc.collect() @@ -64,15 +68,14 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): @pytest.fixture def dist_init(): - if not torch.distributed.is_initialized(): - temp_file = tempfile.mkstemp()[1] - torch.distributed.init_process_group( - backend="nccl", - world_size=1, - rank=0, - init_method=f"file://{temp_file}", - ) - torch.distributed.all_reduce(torch.zeros(1).cuda()) + temp_file = tempfile.mkstemp()[1] + init_distributed_environment( + world_size=1, + rank=0, + distributed_init_method=f"file://{temp_file}", + local_rank=0, + backend="nccl", + ) initialize_model_parallel(1, 1) yield cleanup() diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 92de545ac..514a57e17 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -1,7 +1,8 @@ import pytest import torch -from vllm.distributed.parallel_state import init_distributed_environment +from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, + init_distributed_environment) from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata @@ -292,6 +293,7 @@ def distributed_init(): rank=0, distributed_init_method=f"tcp://127.0.0.1:{get_open_port()}", local_rank=0) + ensure_model_parallel_initialized(1, 1) @pytest.mark.parametrize("batch_size", list(range(2, 128))) diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 
b203c5ec5..75f246526 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -110,7 +110,7 @@ class PallasAttentionBackendImpl(AttentionImpl): raise NotImplementedError("TPU version must be 4 or higher.") self.megacore_mode = None - tpu_type = torch_xla.tpu.get_tpu_env()["TYPE"].lower() + tpu_type = torch_xla.tpu.get_tpu_env()["TYPE"].lower() if not tpu_type.endswith("lite"): if self.num_kv_heads % 2 == 0: self.megacore_mode = "kv_head" diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 2b38ec472..32394a07b 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -1,317 +1,32 @@ -from collections import namedtuple -from contextlib import contextmanager, nullcontext -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, Optional, Union import torch -from torch.distributed import ProcessGroup +import torch.distributed -from .parallel_state import (get_cpu_world_group, get_pp_pynccl_communicator, - get_tensor_model_parallel_group, - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - get_tp_ca_communicator, - get_tp_pynccl_communicator) - - -@dataclass -class GraphCaptureContext: - stream: torch.cuda.Stream - - -@contextmanager -def graph_capture(): - """ - `graph_capture` is a context manager which should surround the code that - is capturing the CUDA graph. Its main purpose is to ensure that the - some operations will be run after the graph is captured, before the graph - is replayed. It returns a `GraphCaptureContext` object which contains the - necessary data for the graph capture. Currently, it only contains the - stream that the graph capture is running on. This stream is set to the - current CUDA stream when the context manager is entered and reset to the - default stream when the context manager is exited.
This is to ensure that - the graph capture is running on a separate stream from the default stream, - in order to explicitly distinguish the kernels to capture - from other kernels possibly launched on background in the default stream. - """ - stream = torch.cuda.Stream() - graph_capture_context = GraphCaptureContext(stream) - ca_comm = get_tp_ca_communicator() - maybe_ca_context = nullcontext() if ca_comm is None else ca_comm.capture() - with torch.cuda.stream(stream), maybe_ca_context: - # In graph mode, we have to be very careful about the collective - # operations. The current status is: - # allreduce \ Mode | Eager | Graph | - # -------------------------------------------- - # custom allreduce | enabled | enabled | - # PyNccl | disabled| enabled | - # torch.distributed | enabled | disabled| - # - # Note that custom allreduce will have a runtime check, if the tensor - # size is too large, it will fallback to the next available option. - # In summary: When using CUDA graph, we use - # either custom all-reduce kernel or pynccl. When not using CUDA - # graph, we use either custom all-reduce kernel or PyTorch NCCL. - # We always prioritize using custom all-reduce kernel but fall back - # to PyTorch or pynccl if it is disabled or not supported. - tp_pynccl_comm = get_tp_pynccl_communicator() - pp_pynccl_comm = get_pp_pynccl_communicator() - if not tp_pynccl_comm: - maybe_tp_pynccl_context = nullcontext() - else: - maybe_tp_pynccl_context = tp_pynccl_comm.change_state( - enable=True, stream=torch.cuda.current_stream()) - if not pp_pynccl_comm: - maybe_pp_pynccl_context = nullcontext() - else: - maybe_pp_pynccl_context = pp_pynccl_comm.change_state( - enable=True, stream=torch.cuda.current_stream()) - with maybe_tp_pynccl_context, maybe_pp_pynccl_context: - yield graph_capture_context +from .parallel_state import get_tp_group def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: - """All-reduce the input tensor across model parallel group. 
- - NOTE: This operation will be applied in-place on the input tensor if - disable_custom_all_reduce is set to True. Otherwise, this operation may or - may not be applied in place depending on whether custom all reduce is - invoked for a particular tensor, which further depends on the tensor size - and GPU topology. - - TLDR: always assume this function modifies its input, but use the return - value as the output. - """ - ca_comm = get_tp_ca_communicator() - - # Bypass the function if we are using only 1 GPU. - if get_tensor_model_parallel_world_size() == 1: - return input_ - if ca_comm is not None: - out = ca_comm.custom_all_reduce(input_) - if out is not None: - return out - pynccl_comm = get_tp_pynccl_communicator() - if (pynccl_comm is not None and not pynccl_comm.disabled): - pynccl_comm.all_reduce(input_) - else: - torch.distributed.all_reduce(input_, - group=get_tensor_model_parallel_group()) - return input_ + """All-reduce the input tensor across model parallel group.""" + return get_tp_group().all_reduce(input_) def tensor_model_parallel_all_gather(input_: torch.Tensor, dim: int = -1) -> torch.Tensor: """All-gather the input tensor across model parallel group.""" - world_size = get_tensor_model_parallel_world_size() - # Bypass the function if we are using only 1 GPU. - if world_size == 1: - return input_ - assert -input_.dim() <= dim < input_.dim(), ( - f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") - if dim < 0: - # Convert negative dim to positive. - dim += input_.dim() - input_size = input_.size() - # Allocate output tensor. - output_tensor = torch.empty((world_size, ) + input_size, - dtype=input_.dtype, - device=input_.device) - # All-gather. 
- torch.distributed.all_gather_into_tensor( - output_tensor, input_, group=get_tensor_model_parallel_group()) - # Reshape - output_tensor = output_tensor.movedim(0, dim) - output_tensor = output_tensor.reshape(input_size[:dim] + - (world_size * input_size[dim], ) + - input_size[dim + 1:]) - return output_tensor + return get_tp_group().all_gather(input_, dim) def tensor_model_parallel_gather(input_: torch.Tensor, dst: int = 0, dim: int = -1) -> torch.Tensor: - """Gather the input tensor across model parallel group. - - NOTE: We assume that the input tensor is on the same device across - all the ranks. - """ - world_size = get_tensor_model_parallel_world_size() - # Bypass the function if we are using only 1 GPU. - if world_size == 1: - return input_ - assert -input_.dim() <= dim < input_.dim(), ( - f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") - if dim < 0: - # Convert negative dim to positive. - dim += input_.dim() - # Allocate output tensor. - if get_tensor_model_parallel_rank() == dst: - gather_list = [torch.empty_like(input_) for _ in range(world_size)] - else: - gather_list = None - # Gather. - torch.distributed.gather(input_, - gather_list, - dst=dst, - group=get_tensor_model_parallel_group()) - if get_tensor_model_parallel_rank() == dst: - output_tensor = torch.cat(gather_list, dim=dim) - else: - output_tensor = None - return output_tensor - - -def broadcast(input_: torch.Tensor, - src: int = 0, - group: Optional[ProcessGroup] = None): - """Broadcast the input tensor.""" - group = group or torch.distributed.group.WORLD - ranks = torch.distributed.get_process_group_ranks(group) - assert src in ranks, f"Invalid src rank ({src})" - - # Bypass the function if we are using only 1 GPU. - world_size = torch.distributed.get_world_size(group=group) - if world_size == 1: - return input_ - # Broadcast. 
- torch.distributed.broadcast(input_, src=src, group=group) - return input_ + """Gather the input tensor across model parallel group.""" + return get_tp_group().gather(input_, dst, dim) -def broadcast_object_list(obj_list: List[Any], - src: int = 0, - group: Optional[ProcessGroup] = None): - """Broadcast the input object list.""" - group = group or torch.distributed.group.WORLD - ranks = torch.distributed.get_process_group_ranks(group) - assert src in ranks, f"Invalid src rank ({src})" - - # Bypass the function if we are using only 1 GPU. - world_size = torch.distributed.get_world_size(group=group) - if world_size == 1: - return obj_list - # Broadcast. - torch.distributed.broadcast_object_list(obj_list, src=src, group=group) - return obj_list - - -TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"]) - - -def _split_tensor_dict( - tensor_dict: Dict[Any, Union[torch.Tensor, Any]] -) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]: - """Split the tensor dictionary into two parts: - 1. A list of (key, value) pairs. If the value is a tensor, it is replaced - by its metadata. - 2. A list of tensors. - """ - metadata_list = [] - tensor_list = [] - for key, value in tensor_dict.items(): - if isinstance(value, torch.Tensor): - # Note: we cannot use `value.device` here, - # because it contains not only the device type but also the device - # index (e.g. "cuda:0"). We only need the device type. - # receiving side will set the device index. 
- device = "cpu" if value.is_cpu else "cuda" - metadata_list.append( - (key, TensorMetadata(device, value.dtype, value.size()))) - tensor_list.append(value) - else: - metadata_list.append((key, value)) - return metadata_list, tensor_list - - -def broadcast_tensor_dict( - tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, - src: int = 0, - group: Optional[ProcessGroup] = None, - metadata_group: Optional[ProcessGroup] = None -) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]: - """Broadcast the input tensor dictionary. - `group` is used to broadcast the tensors, while `metadata_group` is used - to broadcast the metadata of the dict (e.g. dict structure, tensor sizes, - dtypes). - """ - # Bypass the function if we are using only 1 GPU. - if (not torch.distributed.is_initialized() - or torch.distributed.get_world_size(group=group) == 1): +def broadcast_tensor_dict(tensor_dict: Optional[Dict[Any, Union[torch.Tensor, + Any]]] = None, + src: int = 0): + if not torch.distributed.is_initialized(): return tensor_dict - - group = group or torch.distributed.group.WORLD - metadata_group = metadata_group or get_cpu_world_group() - ranks = torch.distributed.get_process_group_ranks(group) - assert src in ranks, f"Invalid src rank ({src})" - - rank = torch.distributed.get_rank() - if rank == src: - metadata_list: List[Tuple[Any, Any]] = [] - assert isinstance( - tensor_dict, - dict), (f"Expecting a dictionary, got {type(tensor_dict)}") - metadata_list, tensor_list = _split_tensor_dict(tensor_dict) - # `metadata_list` lives in CPU memory. - # `broadcast_object_list` involves serialization and deserialization, - # all happening on CPU. Therefore, we can use the CPU group. - torch.distributed.broadcast_object_list([metadata_list], - src=src, - group=metadata_group) - async_handles = [] - for tensor in tensor_list: - if tensor.numel() == 0: - # Skip broadcasting empty tensors. 
- continue - if tensor.is_cpu: - # use metadata_group for CPU tensors - handle = torch.distributed.broadcast(tensor, - src=src, - group=metadata_group, - async_op=True) - else: - # use group for GPU tensors - handle = torch.distributed.broadcast(tensor, - src=src, - group=group, - async_op=True) - async_handles.append(handle) - for async_handle in async_handles: - async_handle.wait() - - else: - recv_metadata_list = [None] - torch.distributed.broadcast_object_list(recv_metadata_list, - src=src, - group=metadata_group) - assert recv_metadata_list[0] is not None - tensor_dict = {} - async_handles = [] - for key, value in recv_metadata_list[0]: - if isinstance(value, TensorMetadata): - tensor = torch.empty(value.size, - dtype=value.dtype, - device=value.device) - if tensor.numel() == 0: - # Skip broadcasting empty tensors. - tensor_dict[key] = tensor - continue - if tensor.is_cpu: - # use metadata_group for CPU tensors - handle = torch.distributed.broadcast(tensor, - src=src, - group=metadata_group, - async_op=True) - else: - # use group for GPU tensors - handle = torch.distributed.broadcast(tensor, - src=src, - group=group, - async_op=True) - async_handles.append(handle) - tensor_dict[key] = tensor - else: - tensor_dict[key] = value - for async_handle in async_handles: - async_handle.wait() - return tensor_dict + return get_tp_group().broadcast_tensor_dict(tensor_dict, src) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index bbc2284f8..9a2b47594 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -9,8 +9,7 @@ import vllm.envs as envs from vllm import _custom_ops as ops from vllm.distributed.device_communicators.custom_all_reduce_utils import ( gpu_p2p_access_check) -from vllm.distributed.parallel_state import ( - get_local_rank, get_tensor_model_parallel_cpu_group, is_in_the_same_node) +from 
vllm.distributed.parallel_state import is_in_the_same_node from vllm.logger import init_logger try: @@ -86,8 +85,8 @@ class CustomAllreduce: # max_size: max supported allreduce size def __init__(self, - group: Optional[ProcessGroup] = None, - device: Optional[Union[int, str, torch.device]] = None, + group: ProcessGroup, + device: Union[int, str, torch.device], max_size=8192 * 1024) -> None: """ Args: @@ -107,7 +106,6 @@ class CustomAllreduce: # e.g. in a non-cuda environment return - group = group or get_tensor_model_parallel_cpu_group() self.group = group assert dist.get_backend(group) != dist.Backend.NCCL, ( @@ -134,10 +132,7 @@ class CustomAllreduce: world_size, str(CustomAllreduce._SUPPORTED_WORLD_SIZES)) return - if device is None: - local_rank = get_local_rank() - device = torch.device(f"cuda:{local_rank}") - elif isinstance(device, int): + if isinstance(device, int): device = torch.device(f"cuda:{device}") elif isinstance(device, str): device = torch.device(device) diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 4b89a23df..1fd0058f6 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -11,7 +11,6 @@ import torch.distributed as dist import torch.multiprocessing as mp import vllm.envs as envs -from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank from vllm.logger import init_logger logger = init_logger(__name__) @@ -162,7 +161,8 @@ def gpu_p2p_access_check(i: int, j: int) -> bool: f"{VLLM_CONFIG_ROOT}/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json" ) os.makedirs(os.path.dirname(path), exist_ok=True) - if ((not is_distributed or get_local_rank() == 0) + from vllm.distributed.parallel_state import get_world_group + if ((not is_distributed or get_world_group().local_rank == 0) and (not os.path.exists(path))): # only the local 
master process (with local_rank == 0) can # enter this block to calculate the cache @@ -174,8 +174,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool: with open(path, "w") as f: json.dump(cache, f, indent=4) if is_distributed: - cpu_world_group = get_cpu_world_group() - dist.barrier(cpu_world_group) + get_world_group().barrier() logger.info("reading GPU P2P access cache from %s", path) with open(path, "r") as f: cache = json.load(f) diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index f5f1de0c7..83eec264b 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -9,7 +9,6 @@ from torch.distributed import ProcessGroup, ReduceOp from vllm.distributed.device_communicators.pynccl_wrapper import ( NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum, ncclRedOpTypeEnum, ncclUniqueId) -from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank from vllm.logger import init_logger logger = init_logger(__name__) @@ -19,8 +18,8 @@ class PyNcclCommunicator: def __init__( self, - group: Optional[ProcessGroup] = None, - device: Optional[Union[int, str, torch.device]] = None, + group: ProcessGroup, + device: Union[int, str, torch.device], library_path: Optional[str] = None, ): """ @@ -35,7 +34,6 @@ class PyNcclCommunicator: is bind to a unique device. 
""" assert dist.is_initialized() - group = get_cpu_world_group() if group is None else group assert dist.get_backend(group) != dist.Backend.NCCL, ( "PyNcclCommunicator should be attached to a non-NCCL group.") self.group = group @@ -77,10 +75,7 @@ class PyNcclCommunicator: byte_list = tensor.tolist() for i, byte in enumerate(byte_list): self.unique_id.internal[i] = byte - if device is None: - local_rank = get_local_rank() - device = torch.device(f"cuda:{local_rank}") - elif isinstance(device, int): + if isinstance(device, int): device = torch.device(f"cuda:{device}") elif isinstance(device, str): device = torch.device(device) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index b6d1eeff0..f6a2fc9b0 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -2,81 +2,518 @@ # Adapted from # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -"""Tensor and pipeline parallel groups.""" +"""vLLM distributed state. +It takes over the control of the distributed environment from PyTorch. +The typical workflow is: + +- call `init_distributed_environment` to initialize the distributed environment. +- call `initialize_model_parallel` or `ensure_model_parallel_initialized` to + initialize the model parallel groups. + +- any code dealing with the distributed stuff + +- call `destroy_model_parallel` to destroy the model parallel groups. +- call `destroy_distributed_environment` to destroy the distributed environment. + +If you only need to use the distributed environment without model/pipeline + parallelism, you can skip the model parallel initialization and destruction + steps. 
+""" import contextlib +from collections import namedtuple +from contextlib import contextmanager, nullcontext +from dataclasses import dataclass from multiprocessing import resource_tracker, shared_memory -from typing import List, Optional +from typing import Any, Dict, List, Optional, Tuple, Union import torch -from torch.distributed import ProcessGroup +from torch.distributed import Backend, ProcessGroup import vllm.envs as envs from vllm.logger import init_logger -logger = init_logger(__name__) -_ENABLE_CUSTOM_ALL_REDUCE = True +@dataclass +class GraphCaptureContext: + stream: torch.cuda.Stream -# Tensor model parallel group that the current rank belongs to. -_TP_DEVICE_GROUP: Optional[ProcessGroup] = None -_TP_CPU_GROUP: Optional[ProcessGroup] = None -_TP_PYNCCL_COMMUNICATOR = None -_TP_CA_COMMUNICATOR = None -# Pipeline model parallel group that the current rank belongs to. -_PP_DEVICE_GROUP: Optional[ProcessGroup] = None -_PP_CPU_GROUP: Optional[ProcessGroup] = None -_PP_PYNCCL_COMMUNICATOR = None - -# when people blindly call `torch.distributed.all_reduce` etc, -# it will use this group. It is initialized with the `backend` -# parameter of `init_distributed_environment` below. -# Essentially, this is `torch.distributed.group.WORLD`. -# We leave a line here to note that this is device-specific. -# Note that this variable is not safe to use, because when users -# call `init_distributed_environment` first, and then destroy -# the process group themselves, this variable will keep a reference to the -# destroyed process group, which is not useful. -_DEVICE_WORLD_GROUP = None - -# duing `init_distributed_environment`, we will also initialize a -# group with `gloo` backend, to allow direct coordination between -# processes through the CPU. -_CPU_WORLD_GROUP = None - -# In summary, after calling `init_distributed_environment`, we will -# always have two groups: one for device-specific (and is the default) -# and one for CPU. 
All processes will be part of both groups. - -# A list of global ranks for each pipeline group to ease calculation of the -# source rank when broadcasting from the first or last pipeline stage. -_PP_GLOBAL_RANKS: Optional[List[int]] = None - -_LOCAL_RANK = -1 +TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"]) -def set_custom_all_reduce(enable: bool): - global _ENABLE_CUSTOM_ALL_REDUCE - _ENABLE_CUSTOM_ALL_REDUCE = enable +def _split_tensor_dict( + tensor_dict: Dict[Any, Union[torch.Tensor, Any]] +) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]: + """Split the tensor dictionary into two parts: + 1. A list of (key, value) pairs. If the value is a tensor, it is replaced + by its metadata. + 2. A list of tensors. + """ + metadata_list = [] + tensor_list = [] + for key, value in tensor_dict.items(): + if isinstance(value, torch.Tensor): + # Note: we cannot use `value.device` here, + # because it contains not only the device type but also the device + # index (e.g. "cuda:0"). We only need the device type. + # receiving side will set the device index. + device = "cpu" if value.is_cpu else "cuda" + metadata_list.append( + (key, TensorMetadata(device, value.dtype, value.size()))) + tensor_list.append(value) + else: + metadata_list.append((key, value)) + return metadata_list, tensor_list -def get_pp_pynccl_communicator(): - global _PP_PYNCCL_COMMUNICATOR - return _PP_PYNCCL_COMMUNICATOR +class GroupCoordinator: + """ + PyTorch ProcessGroup wrapper for a group of processes. + PyTorch ProcessGroup is bound to one specific communication backend, + e.g. NCCL, Gloo, MPI, etc. + GroupCoordinator takes charge of all the communication operations among + the processes in the group. It can route the communication to + a specific implementation (e.g. switch allreduce implementation + based on the tensor size and cuda graph mode). 
+ """ -def get_tp_pynccl_communicator(): - global _TP_PYNCCL_COMMUNICATOR - return _TP_PYNCCL_COMMUNICATOR + # available attributes: + rank: int # global rank + ranks: List[int] # global ranks in the group + world_size: int # size of the group + # difference between `local_rank` and `rank_in_group`: + # if we have a group of size 4 across two nodes: + # Process | Node | Rank | Local Rank | Rank in Group + # 0 | 0 | 0 | 0 | 0 + # 1 | 0 | 1 | 1 | 1 + # 2 | 1 | 2 | 0 | 2 + # 3 | 1 | 3 | 1 | 3 + local_rank: int # local rank used to assign devices + rank_in_group: int # rank inside the group + cpu_group: ProcessGroup # group for CPU communication + device_group: ProcessGroup # group for device communication + use_pynccl: bool # a hint of whether to use PyNccl + use_custom_allreduce: bool # a hint of whether to use CustomAllreduce + # communicators are only created for world size > 1 + pynccl_comm: Optional[Any] # PyNccl communicator + ca_comm: Optional[Any] # Custom allreduce communicator + + def __init__( + self, + group_ranks: List[List[int]], + local_rank: int, + torch_distributed_backend: Union[str, Backend], + use_pynccl: bool, + use_custom_allreduce: bool, + ): + + self.rank = torch.distributed.get_rank() + self.local_rank = local_rank + self.device_group = None + self.cpu_group = None + + for ranks in group_ranks: + device_group = torch.distributed.new_group( + ranks, backend=torch_distributed_backend) + # a group with `gloo` backend, to allow direct coordination between + # processes through the CPU. 
+ cpu_group = torch.distributed.new_group(ranks, backend="gloo") + if self.rank in ranks: + self.ranks = ranks + self.world_size = len(ranks) + self.rank_in_group = ranks.index(self.rank) + self.device_group = device_group + self.cpu_group = cpu_group + + assert self.cpu_group is not None + assert self.device_group is not None + if torch.cuda.is_available(): + self.device = torch.device(f"cuda:{local_rank}") + else: + self.device = torch.device("cpu") -def get_tp_ca_communicator(): - global _TP_CA_COMMUNICATOR - return _TP_CA_COMMUNICATOR + self.use_pynccl = use_pynccl + self.use_custom_allreduce = use_custom_allreduce + + # lazy import to avoid documentation build error + from vllm.distributed.device_communicators.custom_all_reduce import ( + CustomAllreduce) + from vllm.distributed.device_communicators.pynccl import ( + PyNcclCommunicator) + + self.pynccl_comm: Optional[PyNcclCommunicator] + if use_pynccl and self.world_size > 1: + self.pynccl_comm = PyNcclCommunicator( + group=self.cpu_group, + device=self.device, + ) + else: + self.pynccl_comm = None + + self.ca_comm: Optional[CustomAllreduce] + if use_custom_allreduce and self.world_size > 1: + # Initialize a custom fast all-reduce implementation. 
+ self.ca_comm = CustomAllreduce( + group=self.cpu_group, + device=self.device, + ) + else: + self.ca_comm = None + + @property + def first_rank(self): + """Return the global rank of the first process in the group""" + return self.ranks[0] + + @property + def last_rank(self): + """Return the global rank of the last process in the group""" + return self.ranks[-1] + + @property + def next_rank(self): + """Return the global rank of the process that follows the caller""" + rank_in_group = self.rank_in_group + world_size = self.world_size + return self.ranks[(rank_in_group + 1) % world_size] + + @property + def prev_rank(self): + """Return the global rank of the process that precedes the caller""" + rank_in_group = self.rank_in_group + world_size = self.world_size + return self.ranks[(rank_in_group - 1) % world_size] + + @contextmanager + def graph_capture( + self, graph_capture_context: Optional[GraphCaptureContext] = None): + if graph_capture_context is None: + stream = torch.cuda.Stream() + graph_capture_context = GraphCaptureContext(stream) + else: + stream = graph_capture_context.stream + + ca_comm = self.ca_comm + maybe_ca_context = nullcontext( + ) if ca_comm is None else ca_comm.capture() + with torch.cuda.stream(stream), maybe_ca_context: + # In graph mode, we have to be very careful about the collective + # operations. The current status is: + # allreduce \ Mode | Eager | Graph | + # -------------------------------------------- + # custom allreduce | enabled | enabled | + # PyNccl | disabled| enabled | + # torch.distributed | enabled | disabled| + # + # Note that custom allreduce will have a runtime check, if the + # tensor size is too large, it will fallback to the next + # available option. + # In summary: When using CUDA graph, we use + # either custom all-reduce kernel or pynccl. When not using + # CUDA graph, we use either custom all-reduce kernel or + # PyTorch NCCL. 
We always prioritize using custom all-reduce + # kernel but fall back to PyTorch or pynccl if it is + # disabled or not supported. + pynccl_comm = self.pynccl_comm + maybe_pynccl_context: Any + if not pynccl_comm: + maybe_pynccl_context = nullcontext() + else: + maybe_pynccl_context = pynccl_comm.change_state( + enable=True, stream=torch.cuda.current_stream()) + with maybe_pynccl_context: + yield graph_capture_context + + def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: + """ + NOTE: This operation will be applied in-place or out-of-place. + Always assume this function modifies its input, but use the return + value as the output. + """ + ca_comm = self.ca_comm + + # Bypass the function if we are using only 1 GPU. + if self.world_size == 1: + return input_ + if ca_comm is not None: + out = ca_comm.custom_all_reduce(input_) + if out is not None: + return out + pynccl_comm = self.pynccl_comm + if (pynccl_comm is not None and not pynccl_comm.disabled): + pynccl_comm.all_reduce(input_) + else: + torch.distributed.all_reduce(input_, group=self.device_group) + return input_ + + def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: + world_size = self.world_size + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + assert -input_.dim() <= dim < input_.dim(), ( + f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") + if dim < 0: + # Convert negative dim to positive. + dim += input_.dim() + input_size = input_.size() + # Allocate output tensor. + output_tensor = torch.empty((world_size, ) + input_size, + dtype=input_.dtype, + device=input_.device) + # All-gather. 
+ torch.distributed.all_gather_into_tensor(output_tensor, + input_, + group=self.device_group) + # Reshape + output_tensor = output_tensor.movedim(0, dim) + output_tensor = output_tensor.reshape(input_size[:dim] + + (world_size * + input_size[dim], ) + + input_size[dim + 1:]) + return output_tensor + + def gather(self, + input_: torch.Tensor, + dst: int = 0, + dim: int = -1) -> torch.Tensor: + """ + NOTE: We assume that the input tensor is on the same device across + all the ranks. + NOTE: `dst` is the rank in the group of the destination process. + """ + world_size = self.world_size + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + assert -input_.dim() <= dim < input_.dim(), ( + f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") + if dim < 0: + # Convert negative dim to positive. + dim += input_.dim() + # Allocate output tensor. + if self.rank_in_group == dst: + gather_list = [torch.empty_like(input_) for _ in range(world_size)] + else: + gather_list = None + # Gather. + torch.distributed.gather(input_, + gather_list, + dst=self.ranks[dst], + group=self.device_group) + if self.rank_in_group == dst: + output_tensor = torch.cat(gather_list, dim=dim) + else: + output_tensor = None + return output_tensor + + def broadcast(self, input_: torch.Tensor, src: int = 0): + """Broadcast the input tensor. + NOTE: `src` is the rank in the group of the source process. + """ + assert src < self.world_size, f"Invalid src rank ({src})" + + # Bypass the function if we are using only 1 GPU. + if self.world_size == 1: + return input_ + # Broadcast. + torch.distributed.broadcast(input_, + src=self.ranks[src], + group=self.device_group) + return input_ + + def broadcast_object_list(self, + obj_list: List[Any], + src: int = 0, + group: Optional[ProcessGroup] = None): + """Broadcast the input object list. + NOTE: `src` is the rank in the group of the source process.
+ """ + assert src < self.world_size, f"Invalid src rank ({src})" + + # Bypass the function if we are using only 1 GPU. + if self.world_size == 1: + return obj_list + # Broadcast. + torch.distributed.broadcast_object_list(obj_list, + src=self.ranks[src], + group=self.device_group) + return obj_list + + def broadcast_tensor_dict( + self, + tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, + src: int = 0, + group: Optional[ProcessGroup] = None, + metadata_group: Optional[ProcessGroup] = None + ) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]: + """Broadcast the input tensor dictionary. + NOTE: `src` is the rank in the group of the source process. + """ + # Bypass the function if we are using only 1 GPU. + if (not torch.distributed.is_initialized() or self.world_size == 1): + return tensor_dict + + group = self.device_group + metadata_group = self.cpu_group + assert src < self.world_size, f"Invalid src rank ({src})" + src = self.ranks[src] + + rank = self.rank + if rank == src: + metadata_list: List[Tuple[Any, Any]] = [] + assert isinstance( + tensor_dict, + dict), (f"Expecting a dictionary, got {type(tensor_dict)}") + metadata_list, tensor_list = _split_tensor_dict(tensor_dict) + # `metadata_list` lives in CPU memory. + # `broadcast_object_list` has serialization & deserialization, + # all happening on CPU. Therefore, we can use the CPU group. + torch.distributed.broadcast_object_list([metadata_list], + src=src, + group=metadata_group) + async_handles = [] + for tensor in tensor_list: + if tensor.numel() == 0: + # Skip broadcasting empty tensors.
+ continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + handle = torch.distributed.broadcast(tensor, + src=src, + group=metadata_group, + async_op=True) + else: + # use group for GPU tensors + handle = torch.distributed.broadcast(tensor, + src=src, + group=group, + async_op=True) + async_handles.append(handle) + for async_handle in async_handles: + async_handle.wait() + + else: + recv_metadata_list = [None] + torch.distributed.broadcast_object_list(recv_metadata_list, + src=src, + group=metadata_group) + assert recv_metadata_list[0] is not None + tensor_dict = {} + async_handles = [] + for key, value in recv_metadata_list[0]: + if isinstance(value, TensorMetadata): + tensor = torch.empty(value.size, + dtype=value.dtype, + device=value.device) + if tensor.numel() == 0: + # Skip broadcasting empty tensors. + tensor_dict[key] = tensor + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + handle = torch.distributed.broadcast( + tensor, + src=src, + group=metadata_group, + async_op=True) + else: + # use group for GPU tensors + handle = torch.distributed.broadcast(tensor, + src=src, + group=group, + async_op=True) + async_handles.append(handle) + tensor_dict[key] = tensor + else: + tensor_dict[key] = value + for async_handle in async_handles: + async_handle.wait() + return tensor_dict + + def barrier(self): + """Barrier synchronization among the group. + NOTE: don't use `device_group` here! `barrier` in NCCL is + terrible because it is internally a broadcast operation with + secretly created GPU tensors. It is easy to mess up the current + device. Use the CPU group instead. 
+ """ + torch.distributed.barrier(group=self.cpu_group) + + def destroy(self): + if self.device_group is not None: + torch.distributed.destroy_process_group(self.device_group) + self.device_group = None + if self.cpu_group is not None: + torch.distributed.destroy_process_group(self.cpu_group) + self.cpu_group = None + if self.pynccl_comm is not None: + self.pynccl_comm = None + if self.ca_comm is not None: + self.ca_comm = None + + +_WORLD: Optional[GroupCoordinator] = None + + +def get_world_group() -> GroupCoordinator: + assert _WORLD is not None, ("world group is not initialized") + return _WORLD + + +_TP: Optional[GroupCoordinator] = None + + +def get_tp_group() -> GroupCoordinator: + assert _TP is not None, ("tensor model parallel group is not initialized") + return _TP + + +# kept for backward compatibility +get_tensor_model_parallel_group = get_tp_group + +_PP: Optional[GroupCoordinator] = None + + +def get_pp_group() -> GroupCoordinator: + assert _PP is not None, ( + "pipeline model parallel group is not initialized") + return _PP -def get_local_rank(): - global _LOCAL_RANK - return _LOCAL_RANK +# kept for backward compatibility +get_pipeline_model_parallel_group = get_pp_group + + +@contextmanager +def graph_capture(): + """ + `graph_capture` is a context manager which should surround the code that + is capturing the CUDA graph. Its main purpose is to ensure that the + some operations will be run after the graph is captured, before the graph + is replayed. It returns a `GraphCaptureContext` object which contains the + necessary data for the graph capture. Currently, it only contains the + stream that the graph capture is running on. This stream is set to the + current CUDA stream when the context manager is entered and reset to the + default stream when the context manager is exited. 
This is to ensure that + the graph capture is running on a separate stream from the default stream, + in order to explicitly distinguish the kernels to capture + from other kernels possibly launched on background in the default stream. + """ + with get_tp_group().graph_capture() as context, get_pp_group( + ).graph_capture(context): + yield context + + +logger = init_logger(__name__) + +_ENABLE_CUSTOM_ALL_REDUCE = True + + +def set_custom_all_reduce(enable: bool): + global _ENABLE_CUSTOM_ALL_REDUCE + _ENABLE_CUSTOM_ALL_REDUCE = enable def init_distributed_environment( @@ -100,31 +537,29 @@ def init_distributed_environment( init_method=distributed_init_method, world_size=world_size, rank=rank) - global _DEVICE_WORLD_GROUP, _CPU_WORLD_GROUP - _DEVICE_WORLD_GROUP = torch.distributed.group.WORLD + # set the local rank + # local_rank is not available in torch ProcessGroup, + # see https://github.com/pytorch/pytorch/issues/122816 + if local_rank == -1: + # local rank not set, this usually happens in single-node + # setting, where we can use rank as local rank + if distributed_init_method == "env://": + local_rank = envs.LOCAL_RANK + else: + local_rank = rank + global _WORLD + if _WORLD is None: ranks = list(range(torch.distributed.get_world_size())) - _CPU_WORLD_GROUP = torch.distributed.new_group(ranks=ranks, - backend="gloo") - # set the local rank - # local_rank is not available in torch ProcessGroup, - # see https://github.com/pytorch/pytorch/issues/122816 - if local_rank == -1: - # local rank not set, this usually happens in single-node - # setting, where we can use rank as local rank - if distributed_init_method == "env://": - local_rank = envs.LOCAL_RANK - else: - local_rank = rank - global _LOCAL_RANK - _LOCAL_RANK = local_rank - # A small all_reduce for warmup. 
- data = torch.zeros(1) - if torch.cuda.is_available(): - data = data.to(device=f"cuda:{local_rank}") - torch.distributed.all_reduce(data) - if torch.cuda.is_available(): - torch.cuda.synchronize() - del data + _WORLD = GroupCoordinator( + group_ranks=[ranks], + local_rank=local_rank, + torch_distributed_backend=backend, + use_pynccl=False, + use_custom_allreduce=False, + ) + else: + assert _WORLD.world_size == torch.distributed.get_world_size(), ( + "world group already initialized with a different world size") def initialize_model_parallel( @@ -157,8 +592,8 @@ def initialize_model_parallel( # Get world size and rank. Ensure some consistencies. assert torch.distributed.is_initialized() world_size: int = torch.distributed.get_world_size() - # get the backend of _DEVICE_WORLD_GROUP - backend = backend or torch.distributed.get_backend() + backend = backend or torch.distributed.get_backend( + get_world_group().device_group) if (world_size != tensor_model_parallel_size * pipeline_model_parallel_size): @@ -167,63 +602,42 @@ def initialize_model_parallel( f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " f"pipeline_model_parallel_size ({pipeline_model_parallel_size})") + # Build the tensor model-parallel groups. num_tensor_model_parallel_groups: int = (world_size // tensor_model_parallel_size) - num_pipeline_model_parallel_groups: int = (world_size // - pipeline_model_parallel_size) - rank = torch.distributed.get_rank() - - # Build the tensor model-parallel groups. 
- global _TP_DEVICE_GROUP, _TP_CPU_GROUP - global _TP_PYNCCL_COMMUNICATOR, _TP_CA_COMMUNICATOR - assert _TP_DEVICE_GROUP is None, ( - "tensor model parallel group is already initialized") + global _TP + assert _TP is None, ("tensor model parallel group is already initialized") + group_ranks = [] for i in range(num_tensor_model_parallel_groups): ranks = list( range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)) - group = torch.distributed.new_group(ranks, backend=backend) - cpu_group = torch.distributed.new_group(ranks, backend="gloo") - if rank in ranks: - _TP_DEVICE_GROUP = group - _TP_CPU_GROUP = cpu_group - - from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator - if tensor_model_parallel_size > 1: - _TP_PYNCCL_COMMUNICATOR = PyNcclCommunicator( - group=_TP_CPU_GROUP, - device=_LOCAL_RANK, - ) - - # Initialize a custom fast all-reduce implementation. - if _ENABLE_CUSTOM_ALL_REDUCE: - from vllm.distributed.device_communicators.custom_all_reduce import ( - CustomAllreduce) - _TP_CA_COMMUNICATOR = CustomAllreduce( - group=_TP_CPU_GROUP, - device=_LOCAL_RANK, - ) + group_ranks.append(ranks) + _TP = GroupCoordinator( + group_ranks=group_ranks, + local_rank=get_world_group().local_rank, + torch_distributed_backend=backend, + use_pynccl=True, + use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, + ) # Build the pipeline model-parallel groups. 
- global _PP_DEVICE_GROUP, _PP_CPU_GROUP - global _PP_PYNCCL_COMMUNICATOR - global _PP_GLOBAL_RANKS - assert _PP_DEVICE_GROUP is None, ( + num_pipeline_model_parallel_groups: int = (world_size // + pipeline_model_parallel_size) + global _PP + assert _PP is None, ( "pipeline model parallel group is already initialized") + group_ranks = [] for i in range(num_pipeline_model_parallel_groups): ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) - group = torch.distributed.new_group(ranks, backend=backend) - cpu_group = torch.distributed.new_group(ranks, backend="gloo") - if rank in ranks: - _PP_DEVICE_GROUP = group - _PP_CPU_GROUP = cpu_group - _PP_GLOBAL_RANKS = ranks - - if pipeline_model_parallel_size > 1: - _PP_PYNCCL_COMMUNICATOR = PyNcclCommunicator( - group=_PP_CPU_GROUP, - device=_LOCAL_RANK, - ) + group_ranks.append(ranks) + _PP = GroupCoordinator( + group_ranks=group_ranks, + local_rank=get_world_group().local_rank, + torch_distributed_backend=backend, + use_pynccl=True, + use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, + ) def ensure_model_parallel_initialized( @@ -235,8 +649,8 @@ def ensure_model_parallel_initialized( or ensure tensor-parallel and pipeline-parallel sizes are equal to expected values if the model parallel groups are initialized. """ - # get the backend of _DEVICE_WORLD_GROUP - backend = backend or torch.distributed.get_backend() + backend = backend or torch.distributed.get_backend( + get_world_group().device_group) if not model_parallel_is_initialized(): initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend) @@ -247,137 +661,48 @@ def ensure_model_parallel_initialized( ), ("tensor parallel group already initialized, but of unexpected size: " f"{get_tensor_model_parallel_world_size()=} vs. 
" f"{tensor_model_parallel_size=}") - assert (get_pipeline_model_parallel_world_size( - ) == pipeline_model_parallel_size), ( + pp_world_size = get_pp_group().world_size + assert (pp_world_size == pipeline_model_parallel_size), ( "pipeline parallel group already initialized, but of unexpected size: " - f"{get_pipeline_model_parallel_world_size()=} vs. " + f"{pp_world_size=} vs. " f"{pipeline_model_parallel_size=}") def model_parallel_is_initialized(): """Check if tensor and pipeline parallel groups are initialized.""" - return (_TP_DEVICE_GROUP is not None and _PP_DEVICE_GROUP is not None) - - -def get_cpu_world_group(): - """Get the CPU world group.""" - assert _CPU_WORLD_GROUP is not None, ("CPU world group is not initialized") - return _CPU_WORLD_GROUP - - -def get_tensor_model_parallel_group(): - """Get the tensor model parallel group the caller rank belongs to.""" - assert _TP_DEVICE_GROUP is not None, ( - "tensor model parallel group is not initialized") - return _TP_DEVICE_GROUP - - -def get_tensor_model_parallel_cpu_group(): - """Get the tensor model parallel cpu group the caller rank belongs to.""" - assert _TP_CPU_GROUP is not None, ( - "tensor model parallel cpu group is not initialized") - return _TP_CPU_GROUP - - -def get_pipeline_model_parallel_group(): - """Get the pipeline model parallel group the caller rank belongs to.""" - assert _PP_DEVICE_GROUP is not None, ( - "pipeline model parallel group is not initialized") - return _PP_DEVICE_GROUP - - -def get_pipeline_model_parallel_cpu_group(): - """Get the pipeline model parallel cpu group the caller rank belongs to.""" - assert _PP_CPU_GROUP is not None, ( - "pipeline model parallel cpu group is not initialized") - return _PP_CPU_GROUP + return (_TP is not None and _PP is not None) def get_tensor_model_parallel_world_size(): """Return world size for the tensor model parallel group.""" - return torch.distributed.get_world_size( - group=get_tensor_model_parallel_group()) - - -def 
get_pipeline_model_parallel_world_size(): - """Return world size for the pipeline model parallel group.""" - return torch.distributed.get_world_size( - group=get_pipeline_model_parallel_group()) + return get_tp_group().world_size def get_tensor_model_parallel_rank(): """Return my rank for the tensor model parallel group.""" - return torch.distributed.get_rank(group=get_tensor_model_parallel_group()) - - -def get_pipeline_model_parallel_rank(): - """Return my rank for the pipeline model parallel group.""" - return torch.distributed.get_rank( - group=get_pipeline_model_parallel_group()) - - -def get_tensor_model_parallel_src_rank(): - """Calculate the global rank corresponding to the first local rank - in the tensor model parallel group.""" - global_rank = torch.distributed.get_rank() - local_world_size = get_tensor_model_parallel_world_size() - return (global_rank // local_world_size) * local_world_size - - -def get_pipeline_model_parallel_first_rank(): - """Return the global rank of the first process in the pipeline for the - current tensor parallel group""" - assert _PP_GLOBAL_RANKS is not None, ( - "Pipeline parallel group is not initialized") - return _PP_GLOBAL_RANKS[0] - - -def get_pipeline_model_parallel_last_rank(): - """Return the global rank of the last process in the pipeline for the - current tensor parallel group""" - assert _PP_GLOBAL_RANKS is not None, ( - "Pipeline parallel group is not initialized") - last_rank_local = get_pipeline_model_parallel_world_size() - 1 - return _PP_GLOBAL_RANKS[last_rank_local] - - -def get_pipeline_model_parallel_next_rank(): - """Return the global rank that follows the caller in the pipeline""" - assert _PP_GLOBAL_RANKS is not None, ( - "Pipeline parallel group is not initialized") - rank_in_pipeline = get_pipeline_model_parallel_rank() - world_size = get_pipeline_model_parallel_world_size() - return _PP_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size] - - -def get_pipeline_model_parallel_prev_rank(): - """Return the 
global rank that precedes the caller in the pipeline""" - assert _PP_GLOBAL_RANKS is not None, ( - "Pipeline parallel group is not initialized") - rank_in_pipeline = get_pipeline_model_parallel_rank() - world_size = get_pipeline_model_parallel_world_size() - return _PP_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size] + return get_tp_group().rank_in_group def destroy_model_parallel(): """Set the groups to none and destroy them.""" - global _TP_DEVICE_GROUP - if _TP_DEVICE_GROUP: - torch.distributed.destroy_process_group(_TP_DEVICE_GROUP) - _TP_DEVICE_GROUP = None - global _TP_CPU_GROUP - if _TP_CPU_GROUP: - torch.distributed.destroy_process_group(_TP_CPU_GROUP) - _TP_CPU_GROUP = None - global _TP_PYNCCL_COMMUNICATOR - _TP_PYNCCL_COMMUNICATOR = None - - global _PP_DEVICE_GROUP - if _PP_DEVICE_GROUP: - torch.distributed.destroy_process_group(_PP_DEVICE_GROUP) - _PP_DEVICE_GROUP = None - global _PP_GLOBAL_RANKS - _PP_GLOBAL_RANKS = None + global _TP + if _TP: + _TP.destroy() + _TP = None + + global _PP + if _PP: + _PP.destroy() + _PP = None + + +def destroy_distributed_environment(): + global _WORLD + if _WORLD: + _WORLD.destroy() + _WORLD = None + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() def is_in_the_same_node(pg: ProcessGroup): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index de616ef1d..476e9ba3b 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -13,7 +13,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict -from vllm.distributed.communication_op import graph_capture +from vllm.distributed.parallel_state import graph_capture from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest -- GitLab From 916d219d62e9e4005e10be23f81d881afdb8d6d0 Mon Sep 17 00:00:00 2001 
From: "Kevin H. Luu" Date: Wed, 12 Jun 2024 17:58:12 -0700 Subject: [PATCH 016/376] [ci] Use sccache to build images (#5419) Signed-off-by: kevin --- .buildkite/test-template-aws.j2 | 2 +- Dockerfile | 22 ++++++++++++++++++++-- setup.py | 2 +- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index 645747ddd..09649b625 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -7,7 +7,7 @@ steps: queue: cpu_queue commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." + - "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ." - "docker push {{ docker_image }}" env: DOCKER_BUILDKIT: "1" diff --git a/Dockerfile b/Dockerfile index eb96bf3c1..62c401069 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,7 +10,7 @@ FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev RUN apt-get update -y \ - && apt-get install -y python3-pip git + && apt-get install -y python3-pip git curl sudo # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully @@ -70,10 +70,28 @@ ENV NVCC_THREADS=$nvcc_threads # make sure punica kernels are built (for LoRA) ENV VLLM_INSTALL_PUNICA_KERNELS=1 +ARG USE_SCCACHE +# if USE_SCCACHE is set, use sccache to speed up compilation +RUN --mount=type=cache,target=/root/.cache/pip \ + if [ "$USE_SCCACHE" = "1" ]; then \ + echo "Installing sccache..." 
\ + && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \ + && tar -xzf sccache.tar.gz \ + && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ + && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ + && export SCCACHE_BUCKET=vllm-build-sccache \ + && export SCCACHE_REGION=us-west-2 \ + && sccache --show-stats \ + && python3 setup.py bdist_wheel --dist-dir=dist \ + && sccache --show-stats; \ + fi + ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/pip \ - python3 setup.py bdist_wheel --dist-dir=dist + if [ "$USE_SCCACHE" != "1" ]; then \ + python3 setup.py bdist_wheel --dist-dir=dist; \ + fi # check the size of the wheel, we cannot upload wheels larger than 100MB COPY .buildkite/check-wheel-size.py check-wheel-size.py diff --git a/setup.py b/setup.py index 12e5c3456..3a41b1a0b 100644 --- a/setup.py +++ b/setup.py @@ -140,6 +140,7 @@ class cmake_build_ext(build_ext): cmake_args += [ '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache', '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache', + '-DCMAKE_C_COMPILER_LAUNCHER=sccache', ] elif is_ccache_available(): cmake_args += [ @@ -171,7 +172,6 @@ class cmake_build_ext(build_ext): else: # Default build tool to whatever cmake picks. 
build_tool = [] - subprocess.check_call( ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args], cwd=self.build_temp) -- GitLab From 88407532e7ec2dd3313f6cb3a31d8dd1fa868178 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Thu, 13 Jun 2024 11:16:41 +0800 Subject: [PATCH 017/376] =?UTF-8?q?[Bugfix]if=20the=20content=20is=20start?= =?UTF-8?q?ed=20with=20":"(response=20of=20ping),=20client=20should=20i?= =?UTF-8?q?=E2=80=A6=20(#5303)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Wang, Yi A Co-authored-by: Roger Wang --- benchmarks/backend_request_func.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 58dcc6167..52386b8cd 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -68,9 +68,13 @@ async def async_request_tgi( chunk_bytes = chunk_bytes.strip() if not chunk_bytes: continue + chunk_bytes = chunk_bytes.decode("utf-8") - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data:") + #NOTE: Sometimes TGI returns a ping response without + # any data, we should skip it. 
+ if chunk_bytes.startswith(":"): + continue + chunk = remove_prefix(chunk_bytes, "data:") data = json.loads(chunk) timestamp = time.perf_counter() -- GitLab From c2637a613b6140dc16fecd5a1b0f5a9e1d0932ff Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Thu, 13 Jun 2024 10:19:56 -0400 Subject: [PATCH 018/376] [Kernel] `w4a16` support for `compressed-tensors` (#5385) Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> --- tests/quantization/test_compressed_tensors.py | 27 ++- .../compressed_tensors/compressed_tensors.py | 44 ++++- .../compressed_tensors/schemes/__init__.py | 1 + .../schemes/compressed_tensors_w4a16.py | 168 ++++++++++++++++++ 4 files changed, 230 insertions(+), 10 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index e6d8218b4..5670498f2 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -3,12 +3,13 @@ Run `pytest tests/quantization/test_compressed_tensors.py`. 
""" +import pytest import torch from vllm import SamplingParams from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 - CompressedTensorsLinearMethod, CompressedTensorsW8A8DynamicToken, - CompressedTensorsW8A8StaticTensor) + CompressedTensorsLinearMethod, CompressedTensorsW4A16, + CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor) def test_compressed_tensors_w8a8_static_setup(vllm_runner): @@ -60,3 +61,25 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner): assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken) assert qkv_proj.weight.dtype is torch.int8 + + +@pytest.mark.parametrize("w4a16_args", [ + ("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None), + ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128), +]) +def test_compressed_tensors_w4a16(vllm_runner, w4a16_args): + model, strategy, group = w4a16_args + with vllm_runner(model) as llm: + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16) + + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.scheme.group_size == group + + assert qkv_proj.weight_packed.dtype is torch.int32 + assert qkv_proj.weight_scale.dtype is torch.float16 + assert qkv_proj.weight_packed.pack_factor == 8 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index d2b0ce0db..c7f047845 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -7,8 
+7,8 @@ from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 QuantizationConfig) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsScheme, CompressedTensorsW8A8DynamicToken, - CompressedTensorsW8A8StaticTensor) + CompressedTensorsScheme, CompressedTensorsW4A16, + CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match) @@ -47,16 +47,27 @@ class CompressedTensorsConfig(QuantizationConfig): layer_quant_details: Dict[str, Any] = dict() ignore: List[str] = config.get("ignore", None) + # The quant_config has multiple config_groups, each containing + # an input_activations key with details about how the activations are + # quantized, a weights key indicating how the weights are quantized, + # and a list of targets under the `targets` key, dictating which + # layers are impacted by the quantization details. The quantization + # details follow the structure defined by the QuantizationArgs + # pydantic model, which is used to verify the structure of the + # quant_config and also store the details for later use. 
for key, quant_config in config["config_groups"].items(): targets = quant_config.get("targets") for target in targets: layer_quant_details[target] = {} layer_quant_details[target][ - "weight"] = QuantizationArgs.parse_obj( + "weights"] = QuantizationArgs.parse_obj( quant_config.get("weights")) - layer_quant_details[target][ - "input"] = QuantizationArgs.parse_obj( - quant_config.get("input_activations")) + try: + layer_quant_details[target][ + "input_activations"] = QuantizationArgs.parse_obj( + quant_config.get("input_activations")) + except Exception: + layer_quant_details[target]["input_activations"] = None return cls(layer_quant_details=layer_quant_details, ignore=ignore) @@ -86,8 +97,23 @@ class CompressedTensorsConfig(QuantizationConfig): return is_8_bits and is_token_tensor and is_symmetric and is_dynamic + def _is_w4a16(self, weight_quant: BaseModel, + input_quant: BaseModel) -> bool: + input_quant_none = input_quant is None + is_4_bits = weight_quant.num_bits == 4 + is_symmetric = weight_quant.symmetric + is_static = not weight_quant.dynamic + + return is_4_bits and input_quant_none and is_symmetric and is_static + def _get_schema(self, weight_quant: BaseModel, input_quant: BaseModel) -> "CompressedTensorsScheme": + + if self._is_w4a16(weight_quant, input_quant): + return CompressedTensorsW4A16(num_bits=weight_quant.num_bits, + strategy=weight_quant.strategy, + group_size=weight_quant.group_size) + if self._is_static_tensor_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8StaticTensor() @@ -113,8 +139,9 @@ class CompressedTensorsConfig(QuantizationConfig): raise ValueError( f"Could not find quantization details for {layer}.") - return self._get_schema(weight_quant=layer_quant_details["weight"], - input_quant=layer_quant_details["input"]) + return self._get_schema( + weight_quant=layer_quant_details["weights"], + input_quant=layer_quant_details["input_activations"]) class CompressedTensorsLinearMethod(LinearMethodBase): @@ -140,6 +167,7 @@ class 
CompressedTensorsLinearMethod(LinearMethodBase): layer=layer, input_size_per_partition=input_size_per_partition, output_partition_sizes=output_partition_sizes, + input_size=input_size, output_size=output_size, params_dtype=params_dtype, weight_loader=weight_loader) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index 9a910f061..dc84d0008 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -1,6 +1,7 @@ from .compressed_tensors_scheme import CompressedTensorsScheme # noqa: F401 from .compressed_tensors_unquantized import ( # noqa: F401 CompressedTensorsUnquantized) +from .compressed_tensors_w4a16 import CompressedTensorsW4A16 # noqa: F401 from .compressed_tensors_w8a8_dynamictoken import ( # noqa: F401, E501 CompressedTensorsW8A8DynamicToken) from .compressed_tensors_w8a8_statictensor import ( # noqa: F401, E501 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py new file mode 100644 index 000000000..90446a5ff --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py @@ -0,0 +1,168 @@ +from typing import Callable, List, Optional + +import torch +from torch.nn import Parameter + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, GPTQMarlinState, + marlin_permute_scales) +from vllm.model_executor.utils import set_weight_attrs + +__all__ = ["CompressedTensorsW4A16"] + + +class 
CompressedTensorsW4A16(CompressedTensorsScheme): + + def __init__(self, + strategy: str, + num_bits: int, + group_size: Optional[int] = None): + self.num_bits = num_bits + self.strategy = strategy + self.group_size = group_size + + if self.strategy == "group" and self.group_size is None: + raise ValueError( + "group_size must be given when using strategy group") + + def create_weights(self, layer: torch.nn.Module, input_size: int, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + pack_factor = 32 // self.num_bits + output_size_per_partition = sum(output_partition_sizes) + + if self.group_size is not None: + group_size = self.group_size + else: + group_size = input_size + + weight_scale_dim = None + scales_and_zp_size = input_size // group_size + + if (input_size != input_size_per_partition + and self.group_size is not None): + weight_scale_dim = 1 + scales_and_zp_size = input_size_per_partition // group_size + + weight = Parameter( + torch.empty( + output_size_per_partition, + input_size_per_partition // pack_factor, + dtype=torch.int32, + ), + requires_grad=False, + ) + + set_weight_attrs( + weight, { + "input_dim": 1, + "output_dim": 0, + "packed_dim": 1, + "pack_factor": pack_factor + }) + set_weight_attrs(weight, {"weight_loader": weight_loader}) + + layer.register_parameter("weight_packed", weight) + + weight_scale = Parameter( + torch.empty( + output_size_per_partition, + scales_and_zp_size, + dtype=params_dtype, + ), + requires_grad=False, + ) + + set_weight_attrs(weight_scale, {"weight_loader": weight_loader}) + set_weight_attrs(weight_scale, { + "input_dim": weight_scale_dim, + "output_dim": 0 + }) + layer.register_parameter("weight_scale", weight_scale) + + # A 2D array defining the original shape of the weights + # before packing + weight_shape = Parameter(torch.empty(2, dtype=torch.int64), + requires_grad=False) + + layer.register_parameter("weight_shape", 
weight_shape) + set_weight_attrs(weight_shape, {"weight_loader": weight_loader}) + + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + + layer.input_size = input_size + layer.marlin_state = GPTQMarlinState.REPACK + layer.is_k_full = True + layer.group_size = group_size + + max_workspace_size = ( + output_size_per_partition // + GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL + + workspace = torch.zeros(max_workspace_size, + dtype=torch.int, + requires_grad=False) + layer.workspace = workspace + + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): + reshaped_x = x.reshape(-1, x.shape[-1]) + + size_m = reshaped_x.shape[0] + part_size_n = layer.output_size_per_partition + part_size_k = layer.input_size_per_partition + + out_shape = x.shape[:-1] + (part_size_n, ) + + if layer.marlin_state == GPTQMarlinState.REPACK: + layer.marlin_state = GPTQMarlinState.READY + + # Newly generated tensors need to replace existing tensors that are + # already registered as parameters by vLLM (and won't be freed) + def replace_tensor(name, new_t): + # It is important to use resize_() here since it ensures + # the same buffer is reused + getattr(layer, name).resize_(new_t.shape) + getattr(layer, name).copy_(new_t) + del new_t + + cur_device = layer.weight_packed.device + + # Reset g_idx related tensors + layer.g_idx = Parameter(torch.empty(0, + dtype=torch.int, + device=cur_device), + requires_grad=False) + layer.g_idx_sort_indices = Parameter(torch.empty( + 0, dtype=torch.int, device=cur_device), + requires_grad=False) + + # Repack weights + marlin_qweight = ops.gptq_marlin_repack( + layer.weight_packed.t().contiguous(), layer.g_idx_sort_indices, + part_size_k, part_size_n, self.num_bits) + + replace_tensor("weight_packed", marlin_qweight) + + # Permute scales + scales_size_k = part_size_k + scales_size_n = part_size_n + + marlin_scales = marlin_permute_scales( + 
layer.weight_scale.squeeze().t().contiguous(), scales_size_k, + scales_size_n, layer.group_size, self.num_bits) + replace_tensor("weight_scale", marlin_scales) + + output = ops.gptq_marlin_gemm(reshaped_x, layer.weight_packed, + layer.weight_scale, layer.g_idx, + layer.g_idx_sort_indices, + layer.workspace, self.num_bits, size_m, + part_size_n, part_size_k, + layer.is_k_full) + return output.reshape(out_shape) -- GitLab From 23ec72fa032b3d81a5ea9eb0f7c607f1d6e7949a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 13 Jun 2024 11:18:08 -0400 Subject: [PATCH 019/376] [CI/Build][REDO] Add is_quant_method_supported to control quantization test configurations (#5466) --- tests/models/test_aqlm.py | 13 ++----------- tests/models/test_fp8.py | 12 ++---------- tests/models/test_gptq_marlin.py | 13 ++----------- tests/models/test_gptq_marlin_24.py | 13 ++----------- tests/models/test_marlin.py | 13 ++----------- tests/quantization/test_bitsandbytes.py | 10 +++------- tests/quantization/test_fp8.py | 15 +++++---------- tests/quantization/utils.py | 14 ++++++++++++++ 8 files changed, 32 insertions(+), 71 deletions(-) create mode 100644 tests/quantization/utils.py diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index c4ecf846e..80034a511 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -4,17 +4,8 @@ Run `pytest tests/models/test_aqlm.py`. 
""" import pytest -import torch -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS - -aqlm_not_supported = True - -if torch.cuda.is_available(): - capability = torch.cuda.get_device_capability() - capability = capability[0] * 10 + capability[1] - aqlm_not_supported = (capability < - QUANTIZATION_METHODS["aqlm"].get_min_capability()) +from tests.quantization.utils import is_quant_method_supported # In this test we hardcode prompts and generations for the model so we don't # need to require the AQLM package as a dependency @@ -67,7 +58,7 @@ ground_truth_generations = [ ] -@pytest.mark.skipif(aqlm_not_supported, +@pytest.mark.skipif(not is_quant_method_supported("aqlm"), reason="AQLM is not supported on this GPU type.") @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 61aee0d0a..b24c17cf3 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -8,8 +8,8 @@ import pytest import torch from transformers import AutoTokenizer +from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS os.environ["TOKENIZERS_PARALLELISM"] = "true" @@ -67,16 +67,8 @@ EXPECTED_STRS_MAP = { }, } -fp8_not_supported = True -if torch.cuda.is_available(): - capability = torch.cuda.get_device_capability() - capability = capability[0] * 10 + capability[1] - fp8_not_supported = (capability < - QUANTIZATION_METHODS["fp8"].get_min_capability()) - - -@pytest.mark.skipif(fp8_not_supported, +@pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index e957450cc..e30100d9b 
100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -11,9 +11,8 @@ Run `pytest tests/models/test_gptq_marlin.py`. import os import pytest -import torch -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from tests.quantization.utils import is_quant_method_supported from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT from .utils import check_logprobs_close @@ -22,14 +21,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true" MAX_MODEL_LEN = 1024 -gptq_marlin_not_supported = True - -if torch.cuda.is_available(): - capability = torch.cuda.get_device_capability() - capability = capability[0] * 10 + capability[1] - gptq_marlin_not_supported = ( - capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability()) - MODELS = [ # act_order==False, group_size=channelwise ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"), @@ -53,7 +44,7 @@ MODELS = [ @pytest.mark.flaky(reruns=3) -@pytest.mark.skipif(gptq_marlin_not_supported, +@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), reason="gptq_marlin is not supported on this GPU type.") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half", "bfloat16"]) diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index 195c3e5b5..60d9ae2f1 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/test_gptq_marlin_24.py @@ -9,18 +9,9 @@ Run `pytest tests/models/test_marlin_24.py`. 
from dataclasses import dataclass import pytest -import torch from tests.models.utils import check_logprobs_close -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS - -marlin_not_supported = True - -if torch.cuda.is_available(): - capability = torch.cuda.get_device_capability() - capability = capability[0] * 10 + capability[1] - marlin_not_supported = ( - capability < QUANTIZATION_METHODS["marlin"].get_min_capability()) +from tests.quantization.utils import is_quant_method_supported @dataclass @@ -47,7 +38,7 @@ model_pairs = [ @pytest.mark.flaky(reruns=2) -@pytest.mark.skipif(marlin_not_supported, +@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"), reason="Marlin24 is not supported on this GPU type.") @pytest.mark.parametrize("model_pair", model_pairs) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 761ba6aa4..e86f6e29d 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -13,20 +13,11 @@ Run `pytest tests/models/test_marlin.py`. 
from dataclasses import dataclass import pytest -import torch -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from tests.quantization.utils import is_quant_method_supported from .utils import check_logprobs_close -marlin_not_supported = True - -if torch.cuda.is_available(): - capability = torch.cuda.get_device_capability() - capability = capability[0] * 10 + capability[1] - marlin_not_supported = ( - capability < QUANTIZATION_METHODS["marlin"].get_min_capability()) - @dataclass class ModelPair: @@ -45,7 +36,7 @@ model_pairs = [ @pytest.mark.flaky(reruns=2) -@pytest.mark.skipif(marlin_not_supported, +@pytest.mark.skipif(not is_quant_method_supported("marlin"), reason="Marlin is not supported on this GPU type.") @pytest.mark.parametrize("model_pair", model_pairs) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index 31e938d15..953fd9ba9 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -5,16 +5,12 @@ Run `pytest tests/quantization/test_bitsandbytes.py`. 
import pytest import torch +from tests.quantization.utils import is_quant_method_supported from vllm import SamplingParams -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] - -@pytest.mark.skipif( - capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(), - reason='bitsandbytes is not supported on this GPU type.') +@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), + reason='bitsandbytes is not supported on this GPU type.') def test_load_bnb_model(vllm_runner) -> None: with vllm_runner('huggyllama/llama-7b', quantization='bitsandbytes', diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 7cb65326c..74d21ead0 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -5,17 +5,13 @@ Run `pytest tests/quantization/test_fp8.py --forked`. import pytest import torch +from tests.quantization.utils import is_quant_method_supported from vllm._custom_ops import scaled_fp8_quant -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] - -@pytest.mark.skipif( - capability < QUANTIZATION_METHODS["fp8"].get_min_capability(), - reason="FP8 is not supported on this GPU type.") +@pytest.mark.skipif(not is_quant_method_supported("fp8"), + reason="FP8 is not supported on this GPU type.") def test_load_fp16_model(vllm_runner) -> None: with vllm_runner("facebook/opt-125m", quantization="fp8") as llm: @@ -25,9 +21,8 @@ def test_load_fp16_model(vllm_runner) -> None: assert fc1.weight.dtype == torch.float8_e4m3fn -@pytest.mark.skipif( - capability < QUANTIZATION_METHODS["fp8"].get_min_capability(), - reason="FP8 is not supported on this GPU type.") +@pytest.mark.skipif(not 
is_quant_method_supported("fp8"), + reason="FP8 is not supported on this GPU type.") @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) def test_scaled_fp8_quant(dtype) -> None: diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py new file mode 100644 index 000000000..0c92d565d --- /dev/null +++ b/tests/quantization/utils.py @@ -0,0 +1,14 @@ +import torch + +from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS + + +def is_quant_method_supported(quant_method: str) -> bool: + # Currently, all quantization methods require Nvidia or AMD GPUs + if not torch.cuda.is_available(): + return False + + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + return (capability >= + QUANTIZATION_METHODS[quant_method].get_min_capability()) -- GitLab From bd43973522ea17be50e10fbb222a22f673c8067e Mon Sep 17 00:00:00 2001 From: wenyujin333 Date: Fri, 14 Jun 2024 00:01:10 +0800 Subject: [PATCH 020/376] [Kernel] Tune Qwen2MoE kernel configurations with tp2,4 (#5497) Tune Qwen2-57B-A14B configs based on #4921 Throughput Performance command: python benchmarks/benchmark_throughput.py --model=Qwen/Qwen2-57B-A14B-Instruct --input-len 1000 --output-len 50 -tp 2 A100 GPU benchmark no config w/ PR tp=2 10.53 requests/s, 11058.17 tokens/s 12.47 requests/s, 13088.57 tokens/s tp=4 17.77 requests/s, 18662.95 tokens/s 20.20 requests/s, 21212.32 tokens/s --- ...280,device_name=NVIDIA_A100-SXM4-80GB.json | 146 ++++++++++++++++++ ...280,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ ...640,device_name=NVIDIA_A100-SXM4-80GB.json | 146 ++++++++++++++++++ ...640,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ 4 files changed, 584 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json 
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 000000000..8cc6c643f --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 000000000..d4c9ddd12 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 000000000..b2799ed3a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 000000000..b8d3be231 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} -- GitLab From 80aa7e91fcd547a7a1396f71b9bdce18e5c92245 Mon 
Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Fri, 14 Jun 2024 00:33:14 +0800 Subject: [PATCH 021/376] [Hardware][Intel] Optimize CPU backend and add more performance tips (#4971) Co-authored-by: Jianan Gu --- Dockerfile.cpu | 8 +- README.md | 2 +- .../getting_started/cpu-installation.rst | 23 +++- requirements-cpu.txt | 2 +- vllm/attention/backends/torch_sdpa.py | 23 +++- vllm/attention/ops/ipex_attn.py | 120 ++++++++++++++++++ 6 files changed, 165 insertions(+), 13 deletions(-) create mode 100644 vllm/attention/ops/ipex_attn.py diff --git a/Dockerfile.cpu b/Dockerfile.cpu index 403a1cd03..777bb0829 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -3,9 +3,13 @@ FROM ubuntu:22.04 AS cpu-test-1 RUN apt-get update -y \ - && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \ + && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +RUN echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc + +RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl + RUN pip install --upgrade pip \ && pip install wheel packaging ninja "setuptools>=49.4.0" numpy @@ -21,6 +25,6 @@ RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install WORKDIR /workspace/ -RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks CMD ["/bin/bash"] diff --git a/README.md b/README.md index 57374d279..8e4480ac2 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ vLLM is flexible and easy to use with: - Tensor parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs and AMD 
GPUs +- Support NVIDIA GPUs, AMD GPUs, and Intel CPUs - (Experimental) Prefix caching support - (Experimental) Multi-lora support diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index 5270253ca..a9544e8a5 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -10,6 +10,7 @@ Table of contents: #. :ref:`Requirements ` #. :ref:`Quick start using Dockerfile ` #. :ref:`Build from source ` +#. :ref:`Intel Extension for PyTorch ` #. :ref:`Performance tips ` .. _cpu_backend_requirements: @@ -18,7 +19,7 @@ Requirements ------------ * OS: Linux -* Compiler: gcc/g++>=12.3.0 (recommended) +* Compiler: gcc/g++>=12.3.0 (optional, recommended) * Instruction set architecture (ISA) requirement: AVX512 is required. .. _cpu_backend_quick_start_dockerfile: @@ -41,7 +42,7 @@ Quick start using Dockerfile Build from source ----------------- -- First, install required compiler. We recommend to use ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: +- First, install recommended compiler. We recommend to use ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: .. code-block:: console @@ -70,6 +71,15 @@ Build from source - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. +.. _ipex_guidance: + +Intel Extension for PyTorch +--------------------------- + +- `Intel Extension for PyTorch (IPEX) `_ extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. + +- IPEX after the ``2.3.0`` can be enabled in the CPU backend by default if it is installed. + .. 
_cpu_backend_performance_tips: Performance tips @@ -77,6 +87,15 @@ Performance tips - vLLM CPU backend uses environment variable ``VLLM_CPU_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. +- We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: + +.. code-block:: console + + $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library + $ find / -name *libtcmalloc* # find the dynamic link library path + $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD + $ python examples/offline_inference.py # run vLLM + - vLLM CPU backend uses OpenMP for thread-parallel computation. If you want the best performance on CPU, it will be very critical to isolate CPU cores for OpenMP threads with other thread pools (like web-service event-loop), to avoid CPU oversubscription. - If using vLLM CPU backend on a bare-metal machine, it is recommended to disable the hyper-threading. diff --git a/requirements-cpu.txt b/requirements-cpu.txt index b739642d8..8b7d86e68 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -2,5 +2,5 @@ -r requirements-common.txt # Dependencies for x86_64 CPUs -torch == 2.3.0+cpu +torch == 2.3.1+cpu triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. 
\ No newline at end of file diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 9b50adec5..4b08cce99 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -8,8 +8,16 @@ from torch.nn.functional import scaled_dot_product_attention from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata) -from vllm.attention.ops.paged_attn import (PagedAttention, - PagedAttentionMetadata) +from vllm.attention.ops.paged_attn import PagedAttentionMetadata +from vllm.utils import is_cpu + +if is_cpu(): + try: + from vllm.attention.ops.ipex_attn import PagedAttention + except ImportError: + from vllm.attention.ops.paged_attn import PagedAttention +else: + from vllm.attention.ops.paged_attn import PagedAttention class TorchSDPABackend(AttentionBackend): @@ -197,13 +205,14 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): attn_metadata.attn_bias): end = start + seq_len sub_out = scaled_dot_product_attention( - query[:, start:end, :], - key[:, start:end, :], - value[:, start:end, :], + query[None, :, start:end, :], + key[None, :, start:end, :], + value[None, :, start:end, :], attn_mask=mask, dropout_p=0.0, is_causal=not self.need_mask, - scale=self.scale).movedim(query.dim() - 2, 0) + scale=self.scale).squeeze(0).movedim( + query.dim() - 2, 0) output[start:end, :, :] = sub_out start = end else: @@ -248,7 +257,7 @@ def _make_alibi_bias( num_heads = alibi_slopes.shape[0] bias = bias[None, :].repeat((num_heads, 1, 1)) - bias.mul_(alibi_slopes[:, None, None]) + bias.mul_(alibi_slopes[:, None, None]).unsqueeze_(0) inf_mask = torch.empty( (1, seq_len, seq_len), dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1) diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py new file mode 100644 index 000000000..5a5317b65 --- /dev/null +++ b/vllm/attention/ops/ipex_attn.py @@ -0,0 +1,120 @@ +from typing import Dict, List, Optional, Tuple 
+ +import intel_extension_for_pytorch.llm.modules as ipex_modules +import torch + +from vllm import _custom_ops as ops + + +class PagedAttention: + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [64, 80, 96, 112, 128, 256] + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + *args, + ) -> Tuple[int, ...]: + return (2, num_blocks, block_size * num_kv_heads * head_size) + + @staticmethod + def split_kv_cache( + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + *args, + ) -> Tuple[torch.Tensor, torch.Tensor]: + num_blocks = kv_cache.shape[1] + + key_cache = kv_cache[0] + key_cache = key_cache.view(num_blocks, num_kv_heads, -1, head_size) + value_cache = kv_cache[1] + value_cache = value_cache.view(num_blocks, num_kv_heads, -1, head_size) + return key_cache, value_cache + + @staticmethod + def write_to_paged_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + kv_scale: float, + *args, + ) -> None: + ipex_modules.PagedAttention.reshape_and_cache( + key, value, key_cache, value_cache, + slot_mapping.flatten().int()) + + @staticmethod + def forward_decode( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + max_context_len: int, + kv_cache_dtype: str, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + kv_scale: float, + *args, + ) -> torch.Tensor: + output = torch.empty_like(query) + block_size = value_cache.shape[2] + head_mapping = torch.arange( + 0, + num_kv_heads, + device="cpu", + dtype=torch.int32, + ).view(num_kv_heads, + 1).repeat_interleave(query.size(1) // num_kv_heads).flatten() + ipex_modules.PagedAttention.single_query_cached_kv_attention( + output, query.contiguous(), key_cache, value_cache, head_mapping, + scale, block_tables, 
context_lens, block_size, max_context_len, + alibi_slopes) + + return output + + @staticmethod + def forward_prefix( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + subquery_start_loc: torch.Tensor, + prompt_lens_tensor: torch.Tensor, + context_lens: torch.Tensor, + max_subquery_len: int, + alibi_slopes: Optional[torch.Tensor], + *args, + ) -> torch.Tensor: + raise NotImplementedError + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: Dict[int, int], + *args, + ) -> None: + raise NotImplementedError + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], + *args, + ) -> None: + key_caches = [kv_cache[0] for kv_cache in kv_caches] + value_caches = [kv_cache[1] for kv_cache in kv_caches] + ops.copy_blocks(key_caches, value_caches, src_to_dists) -- GitLab From a65634d3ae8928284b3923a46bff89731cb1792e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 13 Jun 2024 10:18:26 -0700 Subject: [PATCH 022/376] [Docs] Add 4th meetup slides (#5509) --- README.md | 8 +------- docs/source/community/meetups.rst | 1 + 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 8e4480ac2..ce2d8d5fd 100644 --- a/README.md +++ b/README.md @@ -23,16 +23,10 @@ If you have cool projects related to vLLM or LLM inference, we would love to see This will be a great chance for everyone in the community to get together and learn. Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite) -**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)** - -We are thrilled to announce our fourth vLLM Meetup! -The vLLM team will share recent updates and roadmap. 
-We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM. -Please register [here](https://lu.ma/agivllm) and join us! - --- *Latest News* 🔥 +- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing). - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). - [2024/01] Added ROCm 6.0 support to vLLM. diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst index f37119478..0fde31ef9 100644 --- a/docs/source/community/meetups.rst +++ b/docs/source/community/meetups.rst @@ -5,6 +5,7 @@ vLLM Meetups We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- `The fourth vLLM meetup `__, with Cloudflare and BentoML, June 11th 2024. `[Slides] `__ - `The third vLLM meetup `__, with Roblox, April 2nd 2024. `[Slides] `__ - `The second vLLM meetup `__, with IBM Research, January 31st 2024. `[Slides] `__ `[Video (vLLM Update)] `__ `[Video (IBM Research & torch.compile)] `__ - `The first vLLM meetup `__, with a16z, October 5th 2023. 
`[Slides] `__ -- GitLab From 03dccc886ef7e5d0dd67512f3e9748ee00c21fb2 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 14 Jun 2024 02:21:39 +0800 Subject: [PATCH 023/376] [Misc] Add vLLM version getter to utils (#5098) --- setup.py | 2 +- vllm/__init__.py | 3 ++- vllm/engine/llm_engine.py | 4 ++-- vllm/entrypoints/openai/api_server.py | 6 +++--- vllm/entrypoints/openai/run_batch.py | 4 ++-- vllm/usage/usage_lib.py | 4 ++-- vllm/version.py | 1 + 7 files changed, 13 insertions(+), 11 deletions(-) create mode 100644 vllm/version.py diff --git a/setup.py b/setup.py index 3a41b1a0b..12a704e08 100644 --- a/setup.py +++ b/setup.py @@ -314,7 +314,7 @@ def find_version(filepath: str) -> str: def get_vllm_version() -> str: - version = find_version(get_path("vllm", "__init__.py")) + version = find_version(get_path("vllm", "version.py")) if _is_cuda(): cuda_version = str(get_nvcc_cuda_version()) diff --git a/vllm/__init__.py b/vllm/__init__.py index 10cc66941..e21705987 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -12,9 +12,10 @@ from vllm.outputs import (CompletionOutput, EmbeddingOutput, from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams -__version__ = "0.5.0" +from .version import __version__ __all__ = [ + "__version__", "LLM", "ModelRegistry", "PromptStrictInputs", diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index ea7547584..b2f6478cb 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -6,7 +6,6 @@ from typing import Type, TypeVar, Union from transformers import GenerationConfig, PreTrainedTokenizer -import vllm from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, @@ -38,6 +37,7 @@ from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message) from vllm.utils import Counter 
+from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 @@ -169,7 +169,7 @@ class LLMEngine: "enforce_eager=%s, kv_cache_dtype=%s, " "quantization_param_path=%s, device_config=%s, " "decoding_config=%r, seed=%d, served_model_name=%s)", - vllm.__version__, + VLLM_VERSION, model_config.model, speculative_config, model_config.tokenizer, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index e7503b965..ea6275920 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -15,7 +15,6 @@ from fastapi.responses import JSONResponse, Response, StreamingResponse from prometheus_client import make_asgi_app from starlette.routing import Mount -import vllm import vllm.envs as envs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -29,6 +28,7 @@ from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext +from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -93,7 +93,7 @@ async def show_available_models(): @app.get("/version") async def show_version(): - ver = {"version": vllm.__version__} + ver = {"version": VLLM_VERSION} return JSONResponse(content=ver) @@ -174,7 +174,7 @@ if __name__ == "__main__": raise ValueError(f"Invalid middleware {middleware}. 
" f"Must be a function or a class.") - logger.info("vLLM API server version %s", vllm.__version__) + logger.info("vLLM API server version %s", VLLM_VERSION) logger.info("args: %s", args) if args.served_model_name is not None: diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 731f4f4a4..7a6819c35 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -5,7 +5,6 @@ from io import StringIO import aiohttp -import vllm from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import (BatchRequestInput, @@ -15,6 +14,7 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils import random_uuid +from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -135,7 +135,7 @@ async def main(args): if __name__ == "__main__": args = parse_args() - logger.info("vLLM API server version %s", vllm.__version__) + logger.info("vLLM API server version %s", VLLM_VERSION) logger.info("args: %s", args) asyncio.run(main(args)) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 40a954a29..afb3007a5 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -16,6 +16,7 @@ import requests import torch import vllm.envs as envs +from vllm.version import __version__ as VLLM_VERSION _config_home = envs.VLLM_CONFIG_ROOT _USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json") @@ -163,9 +164,8 @@ class UsageMessage: ]) # vLLM information - import vllm # delayed import to prevent circular import self.context = usage_context.value - self.vllm_version = vllm.__version__ + self.vllm_version = VLLM_VERSION self.model_architecture = model_architecture # Metadata diff --git a/vllm/version.py b/vllm/version.py new file mode 100644 index 
000000000..3d187266f --- /dev/null +++ b/vllm/version.py @@ -0,0 +1 @@ +__version__ = "0.5.0" -- GitLab From 39873476f8a1cf97bdf5651b4535ae60358ff15b Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 14 Jun 2024 02:21:53 +0800 Subject: [PATCH 024/376] [CI/Build] Simplify OpenAI server setup in tests (#5100) --- tests/async_engine/test_openapi_server_ray.py | 31 ++- tests/entrypoints/test_openai_embedding.py | 113 ++++++++++ tests/entrypoints/test_openai_server.py | 206 ++++-------------- tests/entrypoints/test_openai_vision.py | 35 ++- tests/tensorizer_loader/test_tensorizer.py | 14 +- tests/utils.py | 124 ++++++++--- 6 files changed, 285 insertions(+), 238 deletions(-) create mode 100644 tests/entrypoints/test_openai_embedding.py diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index c25875bd1..cc05d79e5 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -4,16 +4,22 @@ import pytest # and debugging. 
import ray -from ..utils import ServerRunner +from ..utils import VLLM_PATH, RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "facebook/opt-125m" @pytest.fixture(scope="module") -def server(): - ray.init() - server_runner = ServerRunner.remote([ +def ray_ctx(): + ray.init(runtime_env={"working_dir": VLLM_PATH}) + yield + ray.shutdown() + + +@pytest.fixture(scope="module") +def server(ray_ctx): + return RemoteOpenAIServer([ "--model", MODEL_NAME, # use half precision for speed and memory savings in CI environment @@ -24,22 +30,15 @@ def server(): "--enforce-eager", "--engine-use-ray" ]) - ray.get(server_runner.ready.remote()) - yield server_runner - ray.shutdown() @pytest.fixture(scope="module") -def client(): - client = openai.AsyncOpenAI( - base_url="http://localhost:8000/v1", - api_key="token-abc123", - ) - yield client +def client(server): + return server.get_async_client() @pytest.mark.asyncio -async def test_check_models(server, client: openai.AsyncOpenAI): +async def test_check_models(client: openai.AsyncOpenAI): models = await client.models.list() models = models.data served_model = models[0] @@ -48,7 +47,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_single_completion(server, client: openai.AsyncOpenAI): +async def test_single_completion(client: openai.AsyncOpenAI): completion = await client.completions.create(model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, @@ -72,7 +71,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_single_chat_session(server, client: openai.AsyncOpenAI): +async def test_single_chat_session(client: openai.AsyncOpenAI): messages = [{ "role": "system", "content": "you are a helpful assistant" diff --git a/tests/entrypoints/test_openai_embedding.py b/tests/entrypoints/test_openai_embedding.py new file mode 100644 index 000000000..2496d2ac3 --- /dev/null +++ 
b/tests/entrypoints/test_openai_embedding.py @@ -0,0 +1,113 @@ +import openai +import pytest +import ray + +from ..utils import VLLM_PATH, RemoteOpenAIServer + +EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" + +pytestmark = pytest.mark.openai + + +@pytest.fixture(scope="module") +def ray_ctx(): + ray.init(runtime_env={"working_dir": VLLM_PATH}) + yield + ray.shutdown() + + +@pytest.fixture(scope="module") +def embedding_server(ray_ctx): + return RemoteOpenAIServer([ + "--model", + EMBEDDING_MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--enforce-eager", + "--max-model-len", + "8192", + "--enforce-eager", + ]) + + +@pytest.mark.asyncio +@pytest.fixture(scope="module") +def embedding_client(embedding_server): + return embedding_server.get_async_client() + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [EMBEDDING_MODEL_NAME], +) +async def test_single_embedding(embedding_client: openai.AsyncOpenAI, + model_name: str): + input_texts = [ + "The chef prepared a delicious meal.", + ] + + # test single embedding + embeddings = await embedding_client.embeddings.create( + model=model_name, + input=input_texts, + encoding_format="float", + ) + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 9 + assert embeddings.usage.total_tokens == 9 + + # test using token IDs + input_tokens = [1, 1, 1, 1, 1] + embeddings = await embedding_client.embeddings.create( + model=model_name, + input=input_tokens, + encoding_format="float", + ) + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 5 + assert embeddings.usage.total_tokens == 5 + + +@pytest.mark.asyncio 
+@pytest.mark.parametrize( + "model_name", + [EMBEDDING_MODEL_NAME], +) +async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, + model_name: str): + # test List[str] + input_texts = [ + "The cat sat on the mat.", "A feline was resting on a rug.", + "Stars twinkle brightly in the night sky." + ] + embeddings = await embedding_client.embeddings.create( + model=model_name, + input=input_texts, + encoding_format="float", + ) + assert embeddings.id is not None + assert len(embeddings.data) == 3 + assert len(embeddings.data[0].embedding) == 4096 + + # test List[List[int]] + input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], + [25, 32, 64, 77]] + embeddings = await embedding_client.embeddings.create( + model=model_name, + input=input_tokens, + encoding_format="float", + ) + assert embeddings.id is not None + assert len(embeddings.data) == 4 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 17 + assert embeddings.usage.total_tokens == 17 diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index fdf704705..2d7e3044d 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -15,11 +15,10 @@ from openai import BadRequestError from vllm.transformers_utils.tokenizer import get_tokenizer -from ..utils import ServerRunner +from ..utils import VLLM_PATH, RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" # technically this needs Mistral-7B-v0.1 as base, but we're not testing # generation quality here LORA_NAME = "typeof/zephyr-7b-beta-lora" @@ -80,9 +79,15 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") -def server(zephyr_lora_files): - ray.init() - server_runner = ServerRunner.remote([ +def ray_ctx(): + ray.init(runtime_env={"working_dir": 
VLLM_PATH}) + yield + ray.shutdown() + + +@pytest.fixture(scope="module") +def server(zephyr_lora_files, ray_ctx): + return RemoteOpenAIServer([ "--model", MODEL_NAME, # use half precision for speed and memory savings in CI environment @@ -91,8 +96,6 @@ def server(zephyr_lora_files): "--max-model-len", "8192", "--enforce-eager", - "--gpu-memory-utilization", - "0.75", # lora config below "--enable-lora", "--lora-modules", @@ -105,43 +108,14 @@ def server(zephyr_lora_files): "--max-num-seqs", "128", ]) - ray.get(server_runner.ready.remote()) - yield server_runner - ray.shutdown() @pytest.fixture(scope="module") -def embedding_server(zephyr_lora_files): - ray.shutdown() - ray.init() - server_runner = ServerRunner.remote([ - "--model", - EMBEDDING_MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--enforce-eager", - "--gpu-memory-utilization", - "0.75", - "--max-model-len", - "8192", - ]) - ray.get(server_runner.ready.remote()) - yield server_runner - ray.shutdown() - - -@pytest.fixture(scope="module") -def client(): - client = openai.AsyncOpenAI( - base_url="http://localhost:8000/v1", - api_key="token-abc123", - ) - yield client +def client(server): + return server.get_async_client() -@pytest.mark.asyncio -async def test_check_models(server, client: openai.AsyncOpenAI): +async def test_check_models(client: openai.AsyncOpenAI): models = await client.models.list() models = models.data served_model = models[0] @@ -158,8 +132,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI): "model_name", [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], ) -async def test_single_completion(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): completion = await client.completions.create(model=model_name, prompt="Hello, my name is", max_tokens=5, @@ -190,8 +163,7 @@ async def test_single_completion(server, client: 
openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], ) -async def test_no_logprobs(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): # test using token IDs completion = await client.completions.create( model=MODEL_NAME, @@ -210,8 +182,7 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_zero_logprobs(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): # test using token IDs completion = await client.completions.create( model=MODEL_NAME, @@ -232,8 +203,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_some_logprobs(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): # test using token IDs completion = await client.completions.create( model=MODEL_NAME, @@ -254,7 +224,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI, +async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, model_name: str): with pytest.raises( @@ -300,8 +270,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], ) -async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -326,8 +295,7 @@ async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_zero_logprobs_chat(server, client: 
openai.AsyncOpenAI, - model_name: str): +async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -354,8 +322,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -382,7 +349,7 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI, +async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: str): messages = [{ "role": "system", @@ -425,7 +392,7 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_single_chat_session(server, client: openai.AsyncOpenAI, +async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): messages = [{ "role": "system", @@ -470,7 +437,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_completion_streaming(server, client: openai.AsyncOpenAI, +async def test_completion_streaming(client: openai.AsyncOpenAI, model_name: str): prompt = "What is an LLM?" 
@@ -505,8 +472,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_chat_streaming(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -555,8 +521,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI, "model_name", ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], ) -async def test_chat_completion_stream_options(server, - client: openai.AsyncOpenAI, +async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, model_name: str): messages = [{ "role": "system", @@ -626,7 +591,7 @@ async def test_chat_completion_stream_options(server, "model_name", ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], ) -async def test_completion_stream_options(server, client: openai.AsyncOpenAI, +async def test_completion_stream_options(client: openai.AsyncOpenAI, model_name: str): prompt = "What is the capital of France?" 
@@ -688,8 +653,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_batch_completions(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): # test simple list batch = await client.completions.create( model=model_name, @@ -737,7 +701,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_logits_bias(server, client: openai.AsyncOpenAI): +async def test_logits_bias(client: openai.AsyncOpenAI): prompt = "Hello, my name is" max_tokens = 5 tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) @@ -786,7 +750,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_json_completion(server, client: openai.AsyncOpenAI, +async def test_guided_json_completion(client: openai.AsyncOpenAI, guided_decoding_backend: str): completion = await client.completions.create( model=MODEL_NAME, @@ -808,7 +772,7 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_json_chat(server, client: openai.AsyncOpenAI, +async def test_guided_json_chat(client: openai.AsyncOpenAI, guided_decoding_backend: str): messages = [{ "role": "system", @@ -855,7 +819,7 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_regex_completion(server, client: openai.AsyncOpenAI, +async def test_guided_regex_completion(client: openai.AsyncOpenAI, guided_decoding_backend: str): completion = await client.completions.create( model=MODEL_NAME, @@ -875,7 +839,7 
@@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_regex_chat(server, client: openai.AsyncOpenAI, +async def test_guided_regex_chat(client: openai.AsyncOpenAI, guided_decoding_backend: str): messages = [{ "role": "system", @@ -913,7 +877,7 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_choice_completion(server, client: openai.AsyncOpenAI, +async def test_guided_choice_completion(client: openai.AsyncOpenAI, guided_decoding_backend: str): completion = await client.completions.create( model=MODEL_NAME, @@ -933,7 +897,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_choice_chat(server, client: openai.AsyncOpenAI, +async def test_guided_choice_chat(client: openai.AsyncOpenAI, guided_decoding_backend: str): messages = [{ "role": "system", @@ -972,7 +936,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI, +async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, guided_decoding_backend: str): with pytest.raises(openai.BadRequestError): _ = await client.completions.create( @@ -1008,7 +972,7 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, +async def 
test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, guided_decoding_backend: str): messages = [{ "role": "system", @@ -1040,7 +1004,7 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_named_tool_use(server, client: openai.AsyncOpenAI, +async def test_named_tool_use(client: openai.AsyncOpenAI, guided_decoding_backend: str): messages = [{ "role": "system", @@ -1131,7 +1095,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) async def test_required_tool_use_not_yet_supported( - server, client: openai.AsyncOpenAI, guided_decoding_backend: str): + client: openai.AsyncOpenAI, guided_decoding_backend: str): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -1177,7 +1141,7 @@ async def test_required_tool_use_not_yet_supported( @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) async def test_inconsistent_tool_choice_and_tools( - server, client: openai.AsyncOpenAI, guided_decoding_backend: str): + client: openai.AsyncOpenAI, guided_decoding_backend: str): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -1223,7 +1187,7 @@ async def test_inconsistent_tool_choice_and_tools( @pytest.mark.asyncio -async def test_response_format_json_object(server, client: openai.AsyncOpenAI): +async def test_response_format_json_object(client: openai.AsyncOpenAI): for _ in range(2): resp = await client.chat.completions.create( model=MODEL_NAME, @@ -1243,7 +1207,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_extra_fields(server, client: openai.AsyncOpenAI): +async def test_extra_fields(client: openai.AsyncOpenAI): with pytest.raises(BadRequestError) as exc_info: await 
client.chat.completions.create( model=MODEL_NAME, @@ -1259,7 +1223,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_complex_message_content(server, client: openai.AsyncOpenAI): +async def test_complex_message_content(client: openai.AsyncOpenAI): resp = await client.chat.completions.create( model=MODEL_NAME, messages=[{ @@ -1279,7 +1243,7 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_custom_role(server, client: openai.AsyncOpenAI): +async def test_custom_role(client: openai.AsyncOpenAI): # Not sure how the model handles custom roles so we just check that # both string and complex message content are handled in the same way @@ -1310,7 +1274,7 @@ async def test_custom_role(server, client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_guided_grammar(server, client: openai.AsyncOpenAI): +async def test_guided_grammar(client: openai.AsyncOpenAI): simple_sql_grammar = """ start: select_statement @@ -1351,7 +1315,7 @@ number: "1" | "2" [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], ) @pytest.mark.parametrize("logprobs_arg", [1, 0]) -async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI, +async def test_echo_logprob_completion(client: openai.AsyncOpenAI, model_name: str, logprobs_arg: int): tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) # test using text and token IDs @@ -1380,7 +1344,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_long_seed(server, client: openai.AsyncOpenAI): +async def test_long_seed(client: openai.AsyncOpenAI): for seed in [ torch.iinfo(torch.long).min - 1, torch.iinfo(torch.long).max + 1 @@ -1399,81 +1363,5 @@ async def test_long_seed(server, client: openai.AsyncOpenAI): or "less_than_equal" in exc_info.value.message) -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def 
test_single_embedding(embedding_server, client: openai.AsyncOpenAI, - model_name: str): - input_texts = [ - "The chef prepared a delicious meal.", - ] - - # test single embedding - embeddings = await client.embeddings.create( - model=model_name, - input=input_texts, - encoding_format="float", - ) - assert embeddings.id is not None - assert len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) == 4096 - assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 9 - assert embeddings.usage.total_tokens == 9 - - # test using token IDs - input_tokens = [1, 1, 1, 1, 1] - embeddings = await client.embeddings.create( - model=model_name, - input=input_tokens, - encoding_format="float", - ) - assert embeddings.id is not None - assert len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) == 4096 - assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 5 - assert embeddings.usage.total_tokens == 5 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, - model_name: str): - # test List[str] - input_texts = [ - "The cat sat on the mat.", "A feline was resting on a rug.", - "Stars twinkle brightly in the night sky." 
- ] - embeddings = await client.embeddings.create( - model=model_name, - input=input_texts, - encoding_format="float", - ) - assert embeddings.id is not None - assert len(embeddings.data) == 3 - assert len(embeddings.data[0].embedding) == 4096 - - # test List[List[int]] - input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], - [25, 32, 64, 77]] - embeddings = await client.embeddings.create( - model=model_name, - input=input_tokens, - encoding_format="float", - ) - assert embeddings.id is not None - assert len(embeddings.data) == 4 - assert len(embeddings.data[0].embedding) == 4096 - assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 17 - assert embeddings.usage.total_tokens == 17 - - if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/entrypoints/test_openai_vision.py b/tests/entrypoints/test_openai_vision.py index cc03b04e0..03dc5d116 100644 --- a/tests/entrypoints/test_openai_vision.py +++ b/tests/entrypoints/test_openai_vision.py @@ -8,7 +8,7 @@ import ray from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64 -from ..utils import ServerRunner +from ..utils import VLLM_PATH, RemoteOpenAIServer MODEL_NAME = "llava-hf/llava-1.5-7b-hf" LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent / @@ -25,10 +25,16 @@ TEST_IMAGE_URLS = [ pytestmark = pytest.mark.openai +@pytest.fixture(scope="module") +def ray_ctx(): + ray.init(runtime_env={"working_dir": VLLM_PATH}) + yield + ray.shutdown() + + @pytest.fixture(scope="module") def server(): - ray.init() - server_runner = ServerRunner.remote([ + return RemoteOpenAIServer([ "--model", MODEL_NAME, "--dtype", @@ -47,18 +53,11 @@ def server(): "--chat-template", str(LLAVA_CHAT_TEMPLATE), ]) - ray.get(server_runner.ready.remote()) - yield server_runner - ray.shutdown() -@pytest.fixture(scope="session") -def client(): - client = openai.AsyncOpenAI( - base_url="http://localhost:8000/v1", - api_key="token-abc123", - ) - yield client 
+@pytest.fixture(scope="module") +def client(server): + return server.get_async_client() @pytest_asyncio.fixture(scope="session") @@ -73,7 +72,7 @@ async def base64_encoded_image() -> Dict[str, str]: @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) -async def test_single_chat_session_image(server, client: openai.AsyncOpenAI, +async def test_single_chat_session_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): messages = [{ "role": @@ -126,7 +125,7 @@ async def test_single_chat_session_image(server, client: openai.AsyncOpenAI, @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_single_chat_session_image_base64encoded( - server, client: openai.AsyncOpenAI, model_name: str, image_url: str, + client: openai.AsyncOpenAI, model_name: str, image_url: str, base64_encoded_image: Dict[str, str]): messages = [{ @@ -180,7 +179,7 @@ async def test_single_chat_session_image_base64encoded( @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) -async def test_chat_streaming_image(server, client: openai.AsyncOpenAI, +async def test_chat_streaming_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): messages = [{ "role": @@ -237,8 +236,8 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) -async def test_multi_image_input(server, client: openai.AsyncOpenAI, - model_name: str, image_url: str): +async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, + image_url: str): messages = [{ "role": diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 9656cf5f4..c8f86133f 100644 --- 
a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -22,11 +22,12 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, tensorize_vllm_model) from ..conftest import VllmRunner, cleanup -from ..utils import ServerRunner +from ..utils import RemoteOpenAIServer # yapf conflicts with isort for this docstring + prompts = [ "Hello, my name is", "The president of the United States is", @@ -216,18 +217,13 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): openai_args = [ "--model", model_ref, "--dtype", "float16", "--load-format", "tensorizer", "--model-loader-extra-config", - json.dumps(model_loader_extra_config), "--port", "8000" + json.dumps(model_loader_extra_config), ] - server = ServerRunner.remote(openai_args) - - assert ray.get(server.ready.remote()) + server = RemoteOpenAIServer(openai_args) print("Server ready.") - client = openai.OpenAI( - base_url="http://localhost:8000/v1", - api_key="token-abc123", - ) + client = server.get_client() completion = client.completions.create(model=model_ref, prompt="Hello, my name is", max_tokens=5, diff --git a/tests/utils.py b/tests/utils.py index cc8b86276..c84364d20 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,57 +4,109 @@ import sys import time import warnings from contextlib import contextmanager +from typing import List +import openai import ray import requests from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) +from vllm.entrypoints.openai.cli_args import make_arg_parser from vllm.utils import get_open_port # Path to root of repository so that utilities can be imported by ray workers VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)) -@ray.remote(num_gpus=1) -class ServerRunner: +class RemoteOpenAIServer: + DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds - def 
__init__(self, args): - env = os.environ.copy() - env["PYTHONUNBUFFERED"] = "1" - self.proc = subprocess.Popen( - [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] + - args, - env=env, - stdout=sys.stdout, - stderr=sys.stderr, + @ray.remote(num_gpus=1) + class _RemoteRunner: + + def __init__(self, cli_args: List[str], *, wait_url: str, + wait_timeout: float) -> None: + env = os.environ.copy() + env["PYTHONUNBUFFERED"] = "1" + self.proc = subprocess.Popen( + [ + sys.executable, "-m", "vllm.entrypoints.openai.api_server", + *cli_args + ], + env=env, + stdout=sys.stdout, + stderr=sys.stderr, + ) + + self._wait_for_server(url=wait_url, timeout=wait_timeout) + + def ready(self): + return True + + def _wait_for_server(self, *, url: str, timeout: float): + # run health check + start = time.time() + while True: + try: + if requests.get(url).status_code == 200: + break + except Exception as err: + if self.proc.poll() is not None: + raise RuntimeError( + "Server exited unexpectedly.") from err + + time.sleep(0.5) + if time.time() - start > timeout: + raise RuntimeError( + "Server failed to start in time.") from err + + def __del__(self): + if hasattr(self, "proc"): + self.proc.terminate() + + def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None: + if auto_port: + if "-p" in cli_args or "--port" in cli_args: + raise ValueError("You have manually specified the port" + "when `auto_port=True`.") + + cli_args = cli_args + ["--port", str(get_open_port())] + + parser = make_arg_parser() + args = parser.parse_args(cli_args) + self.host = str(args.host or 'localhost') + self.port = int(args.port) + + self._runner = self._RemoteRunner.remote( + cli_args, + wait_url=self.url_for("health"), + wait_timeout=self.MAX_SERVER_START_WAIT_S) + + self._wait_until_ready() + + @property + def url_root(self) -> str: + return f"http://{self.host}:{self.port}" + + def url_for(self, *parts: str) -> str: + return self.url_root + "/" + "/".join(parts) + + def 
_wait_until_ready(self) -> None: + ray.get(self._runner.ready.remote()) + + def get_client(self): + return openai.OpenAI( + base_url=self.url_for("v1"), + api_key=self.DUMMY_API_KEY, + ) + + def get_async_client(self): + return openai.AsyncOpenAI( + base_url=self.url_for("v1"), + api_key=self.DUMMY_API_KEY, ) - self._wait_for_server() - - def ready(self): - return True - - def _wait_for_server(self): - # run health check - start = time.time() - while True: - try: - if requests.get( - "http://localhost:8000/health").status_code == 200: - break - except Exception as err: - if self.proc.poll() is not None: - raise RuntimeError("Server exited unexpectedly.") from err - - time.sleep(0.5) - if time.time() - start > self.MAX_SERVER_START_WAIT_S: - raise RuntimeError( - "Server failed to start in time.") from err - - def __del__(self): - if hasattr(self, "proc"): - self.proc.terminate() def init_test_distributed_environment( -- GitLab From 0ce7b952f8eafdb13a7b6de3af53157c7aae98d4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 14 Jun 2024 02:22:07 +0800 Subject: [PATCH 025/376] [Doc] Update LLaVA docs (#5437) Co-authored-by: Roger Wang --- docs/source/models/vlm.rst | 4 +-- vllm/model_executor/models/llava.py | 29 +++++++++++--------- vllm/model_executor/models/llava_next.py | 34 ++++++++---------------- 3 files changed, 29 insertions(+), 38 deletions(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 5ab4157cb..70ac82e20 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -20,9 +20,9 @@ The following :ref:`engine arguments ` are specific to VLMs: Currently, the support for vision language models on vLLM has the following limitations: * Only single image input is supported per text prompt. - * Dynamic ``image_input_shape`` is not supported: the input image will be resized to the static ``image_input_shape``. This means model output might not exactly match the HuggingFace implementation. 
+ * Dynamic ``image_input_shape`` is not supported: the input image will be resized to the static ``image_input_shape``. This means our LLaVA-NeXT output may not exactly match the huggingface implementation. - We are continuously improving user & developer experience for VLMs. Please raise an issue on GitHub if you have any feedback or feature requests. + We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub `_ if you have any feedback or feature requests. Offline Batched Inference ------------------------- diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 67b32a088..39355b9d3 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -227,7 +227,7 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase): attn_metadata: AttentionMetadata, **kwargs: object, ) -> SamplerOutput: - """Run forward pass for Llava 1.5. + """Run forward pass for LLaVA-1.5. One key thing to understand is the `input_ids` already accounts for the positions of the to-be-inserted image embeddings. @@ -247,22 +247,25 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase): This way, the `positions` and `attn_metadata` are consistent with the `input_ids`. - The model takes two types of image inputs: - PIXEL_VALUES and IMAGE_FEATURES. - The following shows how each maps to huggingface implementation. - PIXEL_VALUES: - - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353 - IMAGE_FEATURES: - - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430 - before going through the multi modal projector. + This model has two modes of image inputs: + `PIXEL_VALUES` and `IMAGE_FEATURES`. Args: input_ids: Flattened (concatenated) input_ids corresponding to a batch. - pixel_values: For PIXEL_VALUES, expects a batch with shape - [1, 3, 336, 336]. 
- image_features: For IMAGE_FEATURES, expects a batch with shape - [1, 576, 1024]. + pixel_values: The pixels in each input image. + Expects a batch with shape `[1, 3, 336, 336]`. + (Only applicable to `PIXEL_VALUES` mode) + image_features: The image features for each input image outputted by + the vision tower before passing to the multi-modal projector. + Expects a batch with shape `[1, 576, 1024]`. + (Only applicable to `IMAGE_FEATURES` mode) + + See also: + Each input maps to huggingface implementation, as follows: + + - `pixel_values`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L360 + - `image_features`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L437 """ image_input = self._parse_and_validate_image_input(**kwargs) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 57cbd1e4a..0ab9afea9 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -108,15 +108,6 @@ def _image_pixel_processor( @MULTIMODAL_REGISTRY.register_image_pixel_input(_image_pixel_processor) @MULTIMODAL_REGISTRY.register_dummy_data(_get_dummy_image_data) class LlavaNextForConditionalGeneration(VisionLanguageModelBase): - """ - Args to `forward()`: - input_ids: Flattened (concatenated) input_ids corresponding to a - batch. - pixel_values: For PIXEL_VALUES, expects a batch with shape - [1, num_patches, 3, 336, 336]. - image_features: For IMAGE_FEATURES, expects a batch with shape - [1, num_patches, 1176, 1024]. - """ def __init__(self, config: LlavaNextConfig, @@ -355,7 +346,7 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase): attn_metadata: AttentionMetadata, **kwargs: object, ) -> SamplerOutput: - """Run forward pass for Llava 1.5. + """Run forward pass for LLaVA-NeXT. 
One key thing to understand is the `input_ids` already accounts for the positions of the to-be-inserted image embeddings. @@ -375,22 +366,19 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase): This way, the `positions` and `attn_metadata` are consistent with the `input_ids`. - The model takes two types of image inputs: - PIXEL_VALUES and IMAGE_FEATURES. - The following shows how each maps to huggingface implementation. - PIXEL_VALUES: - - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353 - IMAGE_FEATURES: - - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430 - before going through the multi modal projector. - Args: input_ids: Flattened (concatenated) input_ids corresponding to a batch. - pixel_values: For PIXEL_VALUES, expects a batch with shape - [1, 3, 336, 336]. - image_features: For IMAGE_FEATURES, expects a batch with shape - [1, 576, 1024]. + pixel_values: The pixels in each grid patch for each input image. + Expects a batch with shape `[1, num_patches, 3, 336, 336]`. + image_sizes: The original `(width, height)` for each input image. + Expects a batch with shape `[1, 2]`. 
+ + See also: + Each input maps to huggingface implementation, as follows: + + - `pixel_values`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava_next/modeling_llava_next.py#L690 + - `image_sizes`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava_next/modeling_llava_next.py#L691 """ image_input = self._parse_and_validate_image_input(**kwargs) -- GitLab From 85657b56071b7c21586d88389c6e817f11c69e04 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 13 Jun 2024 14:22:19 -0400 Subject: [PATCH 026/376] [Kernel] Factor out epilogues from cutlass kernels (#5391) Co-authored-by: Michael Goin Co-authored-by: youkaichao Co-authored-by: zifeitong Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> --- CMakeLists.txt | 8 +- .../cutlass_benchmarks/w8a8_benchmarks.py | 6 +- csrc/ops.h | 6 +- .../{scaled_mm_dq_c2x.cu => scaled_mm_c2x.cu} | 199 ++++++++++-------- .../{scaled_mm_dq_c3x.cu => scaled_mm_c3x.cu} | 194 +++++++++-------- ...aled_mm_dq_entry.cu => scaled_mm_entry.cu} | 48 ++--- csrc/torch_bindings.cpp | 8 +- tests/kernels/test_cutlass.py | 18 +- vllm/_custom_ops.py | 9 +- .../compressed_tensors_w8a8_dynamictoken.py | 4 +- .../compressed_tensors_w8a8_statictensor.py | 4 +- .../model_executor/layers/quantization/fp8.py | 2 +- 12 files changed, 274 insertions(+), 232 deletions(-) rename csrc/quantization/cutlass_w8a8/{scaled_mm_dq_c2x.cu => scaled_mm_c2x.cu} (71%) rename csrc/quantization/cutlass_w8a8/{scaled_mm_dq_c3x.cu => scaled_mm_c3x.cu} (66%) rename csrc/quantization/cutlass_w8a8/{scaled_mm_dq_entry.cu => scaled_mm_entry.cu} (50%) diff --git a/CMakeLists.txt b/CMakeLists.txt index ad6736c47..aa15b632c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -179,9 +179,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" "csrc/custom_all_reduce.cu" - 
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu" - "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu" - "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu") + "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" + "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu" + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") # # The CUTLASS kernels for Hopper require sm90a to be enabled. @@ -189,7 +189,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # That adds an extra 17MB to compiled binary, so instead we selectively enable it. if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0) set_source_files_properties( - "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu" + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a") diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 6de56f618..182105f0b 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -76,11 +76,7 @@ def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor, def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, scale_b: torch.tensor, out_dtype: torch.dtype) -> torch.tensor: - return ops.cutlass_scaled_mm_dq(a, - b, - scale_a, - scale_b, - out_dtype=out_dtype) + return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype) # bench diff --git a/csrc/ops.h b/csrc/ops.h index 0c270a78c..9e2e977fa 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -90,9 +90,9 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, int64_t num_bits); -void cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, torch::Tensor const& a_scales, - torch::Tensor const& b_scales); +void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, torch::Tensor const& a_scales, + torch::Tensor const& 
b_scales); #endif diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu similarity index 71% rename from csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu rename to csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index 23a8b4070..7651268dc 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -29,21 +29,14 @@ using namespace cute; /* - This defines a quantized GEMM operation with dequantized output, similar to - torch._scaled_mm. It is defined using the CUTLASS 2.x API, and is used for + This file defines quantized GEMM operations using the CUTLASS 2.x API, for NVIDIA GPUs with SM versions prior to sm90 (Hopper). - A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or - per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. - A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. - - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. These scales may be either per-tensor or - per row or column. + Epilogue functions can be defined to post-process the output before it is + written to GPU memory. + Epilogues must contain a public type named EVTCompute of type Sm80EVT, + as well as a static prepare_args function that constructs an + EVTCompute::Arguments struct. 
*/ namespace { @@ -83,27 +76,25 @@ struct enable_sm89_to_sm90 : Kernel { } }; -template typename ArchGuard, - typename ElementAB_, typename ElementD_, typename TileShape, - typename WarpShape, typename InstructionShape, int32_t MainLoopStages> -struct cutlass_2x_gemm { - using ElementAB = ElementAB_; - using ElementD = ElementD_; - - using ElementAcc = - typename std::conditional, int32_t, - float>::type; +/* + This epilogue function defines a quantized GEMM operation similar to + torch._scaled_mm. - using Operator = - typename std::conditional, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::arch::OpMultiplyAdd>::type; + A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or + per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). - using OutputTileThreadMap = - cutlass::epilogue::threadblock::OutputTileThreadLayout< - TileShape, WarpShape, float, 4, 1 /* epilogue stages */ - >; + So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. 
+*/ +template +struct ScaledEpilogue { + private: using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< @@ -123,14 +114,56 @@ struct cutlass_2x_gemm { cutlass::multiplies, ElementD, float, cutlass::FloatRoundStyle::round_to_nearest>; - using EVTCompute1 = + public: + using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + using ScaleAArgs = typename ScaleA::Arguments; + using ScaleBArgs = typename ScaleB::Arguments; + + ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; + ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; + + typename EVTCompute0::Arguments evt0_compute_args{b_args}; + + typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args}; + return evt_compute_args; + } +}; + +template typename ArchGuard, + typename ElementAB_, typename ElementD_, + template typename Epilogue_, typename TileShape, + typename WarpShape, typename InstructionShape, int32_t MainLoopStages> +struct cutlass_2x_gemm { + using ElementAB = ElementAB_; + using ElementD = ElementD_; + + using ElementAcc = + typename std::conditional, int32_t, + float>::type; + + using Operator = + typename std::conditional, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::arch::OpMultiplyAdd>::type; + + using OutputTileThreadMap = + cutlass::epilogue::threadblock::OutputTileThreadLayout< + TileShape, WarpShape, float, 4, 1 /* epilogue stages */ + >; + + using Epilogue = Epilogue_; + using EVTCompute = typename Epilogue::EVTCompute; using D = cutlass::epilogue::threadblock::VisitorAuxStore< OutputTileThreadMap, ElementD, cutlass::FloatRoundStyle::round_to_nearest, Stride, Int<0>>>; - using EVTD = cutlass::epilogue::threadblock::Sm80EVT; + using EVTD = cutlass::epilogue::threadblock::Sm80EVT; // clang-format 
off using RowMajor = typename cutlass::layout::RowMajor; @@ -153,11 +186,10 @@ struct cutlass_2x_gemm { using Op = cutlass::gemm::device::GemmUniversalAdapter; }; -template -void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { +template +void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... epilogue_params) { using ElementAB = typename Gemm::ElementAB; using ElementD = typename Gemm::ElementD; @@ -177,23 +209,14 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a, auto b_ptr = static_cast(b.data_ptr()); auto c_ptr = static_cast(out.data_ptr()); - auto a_scales_ptr = a_scales.data_ptr(); - auto b_scales_ptr = b_scales.data_ptr(); - - using ScaleAArgs = typename Gemm::ScaleA::Arguments; - using ScaleBArgs = typename Gemm::ScaleB::Arguments; - - ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; - ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; - - typename Gemm::EVTCompute0::Arguments evt0_compute_args{b_args}; - - typename Gemm::EVTCompute1::Arguments evt1_compute_args{a_args, - evt0_compute_args}; typename Gemm::D::Arguments d_args{c_ptr, c_stride}; + using Epilogue = typename Gemm::Epilogue; + auto evt_args = + Epilogue::prepare_args(std::forward(epilogue_params)...); + typename Gemm::EVTD::Arguments epilogue_args{ - evt1_compute_args, + evt_args, d_args, }; @@ -229,10 +252,10 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a, } // namespace -void cutlass_scaled_mm_dq_sm75(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { +void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { 
TORCH_CHECK(a.dtype() == torch::kInt8); TORCH_CHECK(b.dtype() == torch::kInt8); TORCH_CHECK(a_scales.dtype() == torch::kFloat32); @@ -243,23 +266,23 @@ void cutlass_scaled_mm_dq_sm75(torch::Tensor& out, torch::Tensor const& a, using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>; if (out.dtype() == torch::kBFloat16) { - return cutlass_scaled_mm_dq_dispatcher>(out, a, b, a_scales, - b_scales); + ScaledEpilogue, TileShape, WarpShape, InstructionShape, 2>>( + out, a, b, a_scales, b_scales); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - return cutlass_scaled_mm_dq_dispatcher>(out, a, b, a_scales, - b_scales); + ScaledEpilogue, TileShape, WarpShape, InstructionShape, 2>>( + out, a, b, a_scales, b_scales); } } -void cutlass_scaled_mm_dq_sm80(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { +void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { TORCH_CHECK(a.dtype() == torch::kInt8); TORCH_CHECK(b.dtype() == torch::kInt8); TORCH_CHECK(a_scales.dtype() == torch::kFloat32); @@ -270,23 +293,23 @@ void cutlass_scaled_mm_dq_sm80(torch::Tensor& out, torch::Tensor const& a, using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; if (out.dtype() == torch::kBFloat16) { - return cutlass_scaled_mm_dq_dispatcher>(out, a, b, a_scales, - b_scales); + ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>( + out, a, b, a_scales, b_scales); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - return cutlass_scaled_mm_dq_dispatcher>(out, a, b, a_scales, - b_scales); + ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>( + out, a, b, a_scales, b_scales); } } -void cutlass_scaled_mm_dq_sm89(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { +void 
cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; @@ -298,32 +321,32 @@ void cutlass_scaled_mm_dq_sm89(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b.dtype() == torch::kInt8); if (out.dtype() == torch::kBFloat16) { - return cutlass_scaled_mm_dq_dispatcher>(out, a, b, a_scales, - b_scales); + ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>( + out, a, b, a_scales, b_scales); } else { assert(out.dtype() == torch::kFloat16); - return cutlass_scaled_mm_dq_dispatcher>(out, a, b, a_scales, - b_scales); + ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>( + out, a, b, a_scales, b_scales); } } else { TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); if (out.dtype() == torch::kBFloat16) { - return cutlass_scaled_mm_dq_dispatcher>( - out, a, b, a_scales, b_scales); + cutlass::bfloat16_t, ScaledEpilogue, TileShape, WarpShape, + InstructionShape, 5>>(out, a, b, a_scales, b_scales); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - return cutlass_scaled_mm_dq_dispatcher>( - out, a, b, a_scales, b_scales); + cutlass::half_t, ScaledEpilogue, TileShape, WarpShape, + InstructionShape, 5>>(out, a, b, a_scales, b_scales); } } } diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu similarity index 66% rename from csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu rename to csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index a99802153..f1a2b73ff 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -32,21 +32,14 @@ using namespace cute; /* - This defines a quantized 
GEMM operation with dequantized output, similar to - torch._scaled_mm. It is defined using the CUTLASS 3.x API, and is used for + This file defines quantized GEMM operations using the CUTLASS 3.x API, for NVIDIA GPUs with sm90a (Hopper) or later. - A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or - per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. - A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. - - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. These scales may be either per-tensor or - per row or column. + Epilogue functions can be defined to post-process the output before it is + written to GPU memory. + Epilogues must contain a public type named EVTCompute of type Sm90EVT, + as well as a static prepare_args function that constructs an + EVTCompute::Arguments struct. */ namespace { @@ -71,21 +64,25 @@ struct enable_sm90_or_later : Kernel { } }; -template -struct cutlass_3x_gemm { - using ElementAB = ElementAB_; - using ElementD = ElementD_; - using ElementAcc = - typename std::conditional, int32_t, - float>::type; +/* + This epilogue function defines a quantized GEMM operation similar to + torch._scaled_mm. - using EpilogueDescriptor = - cutlass::epilogue::collective::detail::EpilogueDescriptor< - TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD, - ElementD, EpilogueSchedule>; + A and B may be both either int8 or fp8_e4m3. A can be + quantized per-tensor or per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). 
+ So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. +*/ +template +struct ScaledEpilogue { + private: using Accum = cutlass::epilogue::fusion::Sm90AccFetch; using ScaleA = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast< @@ -111,19 +108,53 @@ struct cutlass_3x_gemm { cutlass::multiplies, ElementD, float, cutlass::FloatRoundStyle::round_to_nearest>; - using EVTCompute1 = + public: + using EVTCompute = cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + using ScaleA_Args = typename ScaleA::Arguments; + using ScaleB_Args = typename ScaleB::Arguments; + + ScaleA_Args a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; + ScaleB_Args b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; + + return ArgumentType{a_args, {b_args}}; + } +}; + +template typename Epilogue_, + typename TileShape, typename ClusterShape, typename KernelSchedule, + typename EpilogueSchedule> +struct cutlass_3x_gemm { + using ElementAB = ElementAB_; + using ElementD = ElementD_; + using ElementAcc = + typename std::conditional, int32_t, + float>::type; + + using EpilogueDescriptor = + cutlass::epilogue::collective::detail::EpilogueDescriptor< + TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD, + ElementD, EpilogueSchedule>; + + using Epilogue = Epilogue_; using StrideD = Stride, Int<0>>; using ElementC = void; using StrideC = StrideD; + using EVTCompute = typename Epilogue::EVTCompute; + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape, ClusterShape, 
cutlass::epilogue::collective::EpilogueTileAuto, ElementAcc, float, ElementC, StrideC, 4, ElementD, StrideD, 4, - EpilogueSchedule, EVTCompute1>::CollectiveOp; + EpilogueSchedule, EVTCompute>::CollectiveOp; static constexpr size_t CEStorageSize = sizeof(typename CollectiveEpilogue::SharedStorage); @@ -148,11 +179,10 @@ struct cutlass_3x_gemm { struct GemmKernel : public KernelType {}; }; -template -void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { +template +void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... epilogue_params) { using ElementAB = typename Gemm::ElementAB; using ElementD = typename Gemm::ElementD; @@ -182,19 +212,13 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a, auto c_ptr = static_cast(out.data_ptr()); typename GemmKernel::EpilogueArguments epilogue_args{ - {}, c_ptr, c_stride, c_ptr, c_stride}; + Gemm::Epilogue::prepare_args( + std::forward(epilogue_params)...), + c_ptr, c_stride, c_ptr, c_stride}; typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm, prob_shape, mainloop_args, epilogue_args}; - using ScaleA_Args = typename Gemm::ScaleA::Arguments; - using ScaleB_Args = typename Gemm::ScaleB::Arguments; - - ScaleA_Args a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; - ScaleB_Args b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; - - args.epilogue.thread = {a_args, {b_args}}; - // Launch the CUTLASS GEMM kernel. 
using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; GemmOp gemm_op; @@ -209,7 +233,8 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a, CUTLASS_CHECK(status); } -template +template typename Epilogue, int32_t M> struct sm90_fp8_config { static_assert(std::is_same()); using KernelSchedule = @@ -219,12 +244,13 @@ struct sm90_fp8_config { using ClusterShape = Shape<_2, _1, _1>; using Cutlass3xGemm = - cutlass_3x_gemm; + cutlass_3x_gemm; }; -template -struct sm90_fp8_config { +template typename Epilogue> +struct sm90_fp8_config { static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; @@ -233,12 +259,13 @@ struct sm90_fp8_config { using ClusterShape = Shape<_2, _1, _1>; using Cutlass3xGemm = - cutlass_3x_gemm; + cutlass_3x_gemm; }; -template -struct sm90_fp8_config { +template typename Epilogue> +struct sm90_fp8_config { static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; @@ -247,30 +274,28 @@ struct sm90_fp8_config { using ClusterShape = Shape<_1, _8, _1>; using Cutlass3xGemm = - cutlass_3x_gemm; + cutlass_3x_gemm; }; } // namespace -template -void cutlass_scaled_mm_dq_sm90_fp8_dispatch(torch::Tensor& out, - torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... 
args) { static_assert(std::is_same()); TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); using Cutlass3xGemmDefault = - typename sm90_fp8_config::Cutlass3xGemm; + typename sm90_fp8_config::Cutlass3xGemm; using Cutlass3xGemmM64 = - typename sm90_fp8_config::Cutlass3xGemm; + typename sm90_fp8_config::Cutlass3xGemm; using Cutlass3xGemmM128 = - typename sm90_fp8_config::Cutlass3xGemm; + typename sm90_fp8_config::Cutlass3xGemm; uint32_t const m = a.size(0); uint32_t const mp2 = @@ -278,23 +303,23 @@ void cutlass_scaled_mm_dq_sm90_fp8_dispatch(torch::Tensor& out, if (mp2 <= 64) { // m in [1, 64] - return cutlass_scaled_mm_dq_dispatcher( - out, a, b, a_scales, b_scales); + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); } else if (mp2 <= 128) { // m in (64, 128] - return cutlass_scaled_mm_dq_dispatcher( - out, a, b, a_scales, b_scales); + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); } else { // m in (128, inf) - return cutlass_scaled_mm_dq_dispatcher( - out, a, b, a_scales, b_scales); + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); } } -void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { +void cutlass_scaled_mm_sm90(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); @@ -308,16 +333,15 @@ void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a, using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; if (out.dtype() == torch::kBFloat16) { - return cutlass_scaled_mm_dq_dispatcher< - cutlass_3x_gemm>( - out, a, b, a_scales, b_scales); + 
return cutlass_gemm_caller>(out, a, b, a_scales, b_scales); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - return cutlass_scaled_mm_dq_dispatcher< - cutlass_3x_gemm>( + return cutlass_gemm_caller< + cutlass_3x_gemm>( out, a, b, a_scales, b_scales); } } else { @@ -325,13 +349,13 @@ void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); if (out.dtype() == torch::kBFloat16) { - return cutlass_scaled_mm_dq_sm90_fp8_dispatch( + return cutlass_gemm_sm90_fp8_dispatch< + cutlass::float_e4m3_t, cutlass::bfloat16_t, ScaledEpilogue>( out, a, b, a_scales, b_scales); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - return cutlass_scaled_mm_dq_sm90_fp8_dispatch( + return cutlass_gemm_sm90_fp8_dispatch( out, a, b, a_scales, b_scales); } } diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu similarity index 50% rename from csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu rename to csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 423e64a49..687f8efd8 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -3,31 +3,31 @@ #include #include -void cutlass_scaled_mm_dq_sm75(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales); +void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales); -void cutlass_scaled_mm_dq_sm80(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales); +void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales); -void cutlass_scaled_mm_dq_sm89(torch::Tensor& c, torch::Tensor 
const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales); +void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales); #if defined CUDA_VERSION && CUDA_VERSION >= 12000 -void cutlass_scaled_mm_dq_sm90(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales); +void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales); #endif -void cutlass_scaled_mm_dq(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& b, torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { +void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, + torch::Tensor const& b, torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { int32_t major_capability; int32_t minor_capability; cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor, @@ -57,19 +57,19 @@ void cutlass_scaled_mm_dq(torch::Tensor& c, torch::Tensor const& a, // Guard against compilation issues for sm90 kernels #if defined CUDA_VERSION && CUDA_VERSION >= 12000 - cutlass_scaled_mm_dq_sm90(c, a, b, a_scales, b_scales); + cutlass_scaled_mm_sm90(c, a, b, a_scales, b_scales); #else - cutlass_scaled_mm_dq_sm80(c, a, b, a_scales, b_scales); + cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales); #endif } else if (version_num == 89) { // Ada Lovelace - cutlass_scaled_mm_dq_sm89(c, a, b, a_scales, b_scales); + cutlass_scaled_mm_sm89(c, a, b, a_scales, b_scales); } else if (version_num >= 80) { // Ampere - cutlass_scaled_mm_dq_sm80(c, a, b, a_scales, b_scales); + cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales); } else { // Turing TORCH_CHECK(version_num >= 75); - cutlass_scaled_mm_dq_sm75(c, a, b, a_scales, b_scales); + cutlass_scaled_mm_sm75(c, a, b, 
a_scales, b_scales); } } diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index df2603544..867bf4389 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -136,10 +136,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column // quantization. ops.def( - "cutlass_scaled_mm_dq(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales) -> ()"); - ops.impl("cutlass_scaled_mm_dq", torch::kCUDA, &cutlass_scaled_mm_dq); + "cutlass_scaled_mm(Tensor! out, Tensor a," + " Tensor b, Tensor a_scales," + " Tensor b_scales) -> ()"); + ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm); #endif // Quantized GEMM for GPTQ. diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 079d9650c..777138ace 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -47,7 +47,7 @@ def cutlass_fp8_gemm_helper(m: int, scale_b = (torch.randn( (1, n_b_scales), device=device, dtype=torch.float32) / 10) - out = ops.cutlass_scaled_mm_dq(a, b, scale_a, scale_b, out_dtype) + out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype) baseline = torch.mm(scale_a * a.to(dtype=torch.float32), scale_b * b.to(dtype=torch.float32)).to(out_dtype) @@ -74,7 +74,7 @@ def cutlass_int8_gemm_helper(m: int, scale_b = (torch.randn( (1, n_b_scales), device=device, dtype=torch.float32) / 10) - out = ops.cutlass_scaled_mm_dq(a, b, scale_a, scale_b, out_dtype) + out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype) baseline = torch.mm(scale_a * a.to(dtype=torch.float32), scale_b * b.to(dtype=torch.float32)).to(dtype=out_dtype) @@ -180,11 +180,11 @@ def test_cutlass_subset(): scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 - out = ops.cutlass_scaled_mm_dq(a, - b, - scale_a, - scale_b, - out_dtype=torch.bfloat16) + out = 
ops.cutlass_scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.bfloat16) baseline = torch.mm(scale_a * a.to(dtype=torch.float32), scale_b * b.to(dtype=torch.float32)).to(dtype=torch.bfloat16) @@ -203,8 +203,8 @@ class CutlassLayer(torch.nn.Module): self.out_dtype = out_dtype def forward(self, a): - return ops.cutlass_scaled_mm_dq(a, self.b, self.scale_a, self.scale_b, - self.out_dtype) + return ops.cutlass_scaled_mm(a, self.b, self.scale_a, self.scale_b, + self.out_dtype) @pytest.mark.parametrize("per_act_token", [True, False]) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 955086be1..2f84b8bde 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -212,9 +212,9 @@ def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # cutlass -def cutlass_scaled_mm_dq(a: torch.Tensor, b: torch.Tensor, - scale_a: torch.Tensor, scale_b: torch.Tensor, - out_dtype: Type[torch.dtype]) -> torch.Tensor: +def cutlass_scaled_mm(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: Type[torch.dtype]) -> torch.Tensor: assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0) assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16) @@ -222,8 +222,7 @@ def cutlass_scaled_mm_dq(a: torch.Tensor, b: torch.Tensor, n = b.shape[1] out = torch.empty((m, n), dtype=out_dtype, device=a.device) - torch.ops._C.cutlass_scaled_mm_dq(out, a, b, scale_a, scale_b) - + torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b) return out diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py index 25b707cae..9bb7bf447 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py @@ 
-81,5 +81,5 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme): weight_scale = layer.weight_scale x_q, input_scales = custom_ops.scaled_int8_quant(x) - return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), input_scales, - weight_scale, x.dtype) + return custom_ops.cutlass_scaled_mm(x_q, weight.t(), input_scales, + weight_scale, x.dtype) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py index 7559fc0f9..88c15c5c2 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py @@ -99,5 +99,5 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme): # Input quantize x_q, _ = custom_ops.scaled_int8_quant(x, act_scale) - return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), act_scale, - weight_scale, x.dtype) + return custom_ops.cutlass_scaled_mm(x_q, weight.t(), act_scale, + weight_scale, x.dtype) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 0cf2bd927..e89fd6581 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -261,7 +261,7 @@ class Fp8LinearMethod(LinearMethodBase): qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale) # Fused GEMM_DQ - output = ops.cutlass_scaled_mm_dq( + output = ops.cutlass_scaled_mm( qinput, layer.weight, out_dtype=x.dtype, -- GitLab From 30299a41fa78c7bf485aca7ef8ad584ca340a64d Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Thu, 13 Jun 2024 11:22:30 -0700 Subject: [PATCH 027/376] [MISC] Remove FP8 warning (#5472) Co-authored-by: Philipp Moritz --- vllm/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/vllm/config.py b/vllm/config.py index 2513d43ce..76c10d464 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -212,7 +212,7 @@ class ModelConfig: f"{self.quantization} quantization is currently not " f"supported in ROCm.") if (self.quantization - not in ["marlin", "gptq_marlin_24", "gptq_marlin"]): + not in ("fp8", "marlin", "gptq_marlin_24", "gptq_marlin")): logger.warning( "%s quantization is not fully " "optimized yet. The speed can be slower than " -- GitLab From a8fda4f66131e211ac1e64f6b1d74123e0347a1c Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 13 Jun 2024 11:22:41 -0700 Subject: [PATCH 028/376] Seperate dev requirements into lint and test (#5474) --- Dockerfile | 2 ++ requirements-dev.txt | 40 ++++------------------------------------ requirements-lint.txt | 14 ++++++++++++++ requirements-test.txt | 22 ++++++++++++++++++++++ 4 files changed, 42 insertions(+), 36 deletions(-) create mode 100644 requirements-lint.txt create mode 100644 requirements-test.txt diff --git a/Dockerfile b/Dockerfile index 62c401069..72894e7ca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,6 +27,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements-cuda.txt # install development dependencies +COPY requirements-lint.txt requirements-lint.txt +COPY requirements-test.txt requirements-test.txt COPY requirements-dev.txt requirements-dev.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements-dev.txt diff --git a/requirements-dev.txt b/requirements-dev.txt index b380ef205..421aa2e79 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,37 +1,5 @@ -# formatting -yapf==0.32.0 -toml==0.10.2 -tomli==2.0.1 -ruff==0.1.5 -codespell==2.3.0 -isort==5.13.2 -clang-format==18.1.5 +-r requirements-lint.txt +-r requirements-test.txt -# type checking -mypy==1.9.0 -types-PyYAML -types-requests -types-setuptools - -# testing -pytest -tensorizer>=2.9.0 -pytest-forked -pytest-asyncio -pytest-rerunfailures -pytest-shard - -# 
testing utils -awscli -einops # required for MPT -httpx -peft -requests -ray -sentence-transformers # required for embedding - -# Benchmarking -aiohttp - -# quantization -bitsandbytes==0.42.0 +# Avoid adding requirements directly to this file. +# Instead, modify the two files referenced above. diff --git a/requirements-lint.txt b/requirements-lint.txt new file mode 100644 index 000000000..bd34227d3 --- /dev/null +++ b/requirements-lint.txt @@ -0,0 +1,14 @@ +# formatting +yapf==0.32.0 +toml==0.10.2 +tomli==2.0.1 +ruff==0.1.5 +codespell==2.3.0 +isort==5.13.2 +clang-format==18.1.5 + +# type checking +mypy==1.9.0 +types-PyYAML +types-requests +types-setuptools diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 000000000..8b68e0e93 --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,22 @@ +# testing +pytest +tensorizer>=2.9.0 +pytest-forked +pytest-asyncio +pytest-rerunfailures +pytest-shard + +# testing utils +awscli +einops # required for MPT +httpx +peft +requests +ray +sentence-transformers # required for embedding + +# Benchmarking +aiohttp + +# quantization +bitsandbytes==0.42.0 -- GitLab From 6b0511a57bdba85efe2b4d5588dd16280c8fdc78 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 13 Jun 2024 11:22:50 -0700 Subject: [PATCH 029/376] Revert "[Core] Remove unnecessary copies in flash attn backend" (#5478) --- vllm/attention/backends/flash_attn.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 8c64c2bfd..300bab728 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -317,7 +317,7 @@ class FlashAttentionImpl(AttentionImpl): # normal attention # When block_tables are not filled, it means q and k are the # prompt, and they have the same length. 
- flash_attn_varlen_func( + out = flash_attn_varlen_func( q=query, k=key, v=value, @@ -329,13 +329,14 @@ class FlashAttentionImpl(AttentionImpl): causal=True, window_size=self.sliding_window, alibi_slopes=self.alibi_slopes, - out=output[:num_prefill_tokens], ) + assert output[:num_prefill_tokens].shape == out.shape + output[:num_prefill_tokens] = out else: # prefix-enabled attention assert prefill_meta.seq_lens is not None max_seq_len = max(prefill_meta.seq_lens) - flash_attn_varlen_func( + output[:num_prefill_tokens] = flash_attn_varlen_func( q=query, k=key_cache, v=value_cache, @@ -347,12 +348,11 @@ class FlashAttentionImpl(AttentionImpl): causal=True, alibi_slopes=self.alibi_slopes, block_table=prefill_meta.block_tables, - out=output[:num_prefill_tokens], ) if decode_meta := attn_metadata.decode_metadata: # Decoding run. - flash_attn_with_kvcache( + output[num_prefill_tokens:] = flash_attn_with_kvcache( decode_query.unsqueeze(1), key_cache, value_cache, @@ -361,8 +361,7 @@ class FlashAttentionImpl(AttentionImpl): softmax_scale=self.scale, causal=True, alibi_slopes=self.alibi_slopes, - out=output[num_prefill_tokens:].unsqueeze(1), - ) + ).squeeze(1) # Reshape the output tensor. 
return output.view(num_tokens, hidden_size) -- GitLab From 1696efe6c91a82e1aca5b49f4bc7899802115981 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 13 Jun 2024 12:09:16 -0700 Subject: [PATCH 030/376] [misc] fix format.sh (#5511) --- format.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/format.sh b/format.sh index 6057b69af..2fd6af03b 100755 --- a/format.sh +++ b/format.sh @@ -36,12 +36,12 @@ tool_version_check() { fi } -tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)" -tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)" -tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)" -tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-dev.txt | cut -d'=' -f3)" -tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-dev.txt | cut -d'=' -f3)" -tool_version_check "clang-format" "$CLANGFORMAT_VERSION" "$(grep clang-format requirements-dev.txt | cut -d'=' -f3)" +tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-lint.txt | cut -d'=' -f3)" +tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-lint.txt | cut -d'=' -f3)" +tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-lint.txt | cut -d'=' -f3)" +tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-lint.txt | cut -d'=' -f3)" +tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-lint.txt | cut -d'=' -f3)" +tool_version_check "clang-format" "$CLANGFORMAT_VERSION" "$(grep clang-format requirements-lint.txt | cut -d'=' -f3)" YAPF_FLAGS=( '--recursive' -- GitLab From 33e3b372429232cea44266d866906effaa705a10 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 13 Jun 2024 16:37:48 -0400 Subject: [PATCH 031/376] [CI/Build] Disable test_fp8.py (#5508) --- tests/models/test_fp8.py | 8 ++++++++ 1 file changed, 8 
insertions(+) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index b24c17cf3..2b5609188 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -68,6 +68,14 @@ EXPECTED_STRS_MAP = { } +# This test compares against golden strings for exact match since +# there is no baseline implementation to compare against +# and is unstable w.r.t specifics of the fp8 implementation or +# the hardware being run on. +# Disabled to prevent it from breaking the build +@pytest.mark.skip( + reason= + "Prevent unstable test based on golden strings from breaking the build.") @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) -- GitLab From e38042d4af1ddb390c3dd9340250de25bee37c62 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 13 Jun 2024 16:38:05 -0400 Subject: [PATCH 032/376] [Kernel] Disable CUTLASS kernels for fp8 (#5505) --- vllm/model_executor/layers/quantization/fp8.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e89fd6581..bc08bfcc3 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -257,7 +257,9 @@ class Fp8LinearMethod(LinearMethodBase): # If dynamic, layer.input_scale is None and x_scale computed from x. # If static, layer.input_scale is scalar and x_scale is input_scale. 
- if bias is None and self.cutlass_fp8_supported: + # Temporarily disable CUTLASS kernels due to an illegal memory access + #if bias is None and self.cutlass_fp8_supported: + if False: qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale) # Fused GEMM_DQ -- GitLab From 50eed24d252965a81ce50b64fd387d60fb1f4f6e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 13 Jun 2024 16:06:49 -0700 Subject: [PATCH 033/376] Add `cuda_device_count_stateless` (#5473) --- .buildkite/test-pipeline.yaml | 1 + tests/conftest.py | 17 ++------- tests/distributed/test_utils.py | 31 ++++++++++++++++ vllm/config.py | 6 ++-- .../device_communicators/custom_all_reduce.py | 3 +- .../custom_all_reduce_utils.py | 3 +- vllm/executor/multiproc_gpu_executor.py | 6 ++-- vllm/utils.py | 35 +++++++++++++++++++ 8 files changed, 79 insertions(+), 23 deletions(-) create mode 100644 tests/distributed/test_utils.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6b12d19ba..6a2932db9 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -48,6 +48,7 @@ steps: - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - pytest -v -s spec_decode/e2e/test_integration_dist.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py - label: Distributed Tests (Multiple Groups) #mirror_hardwares: [amd] diff --git a/tests/conftest.py b/tests/conftest.py index 29a4f126f..18aea3702 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,6 @@ import contextlib import gc import os -import subprocess -import sys from typing import Any, Dict, List, Optional, Tuple, TypeVar import pytest @@ -22,7 +20,7 @@ from vllm.logger import init_logger from vllm.multimodal import MultiModalData from vllm.multimodal.image import ImageFeatureData, ImagePixelData from vllm.sequence import 
SampleLogprobs -from vllm.utils import is_cpu +from vllm.utils import cuda_device_count_stateless, is_cpu logger = init_logger(__name__) @@ -539,15 +537,4 @@ def num_gpus_available(): """Get number of GPUs without initializing the CUDA context in current process.""" - try: - out = subprocess.run([ - sys.executable, "-c", - "import torch; print(torch.cuda.device_count())" - ], - capture_output=True, - check=True, - text=True) - except subprocess.CalledProcessError as e: - logger.warning("Failed to get number of GPUs.", exc_info=e) - return 0 - return int(out.stdout.strip()) + return cuda_device_count_stateless() diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py new file mode 100644 index 000000000..b7ec59c7a --- /dev/null +++ b/tests/distributed/test_utils.py @@ -0,0 +1,31 @@ +import os + +import ray + +from vllm.utils import cuda_device_count_stateless + + +@ray.remote +class _CUDADeviceCountStatelessTestActor(): + + def get_count(self): + return cuda_device_count_stateless() + + def set_cuda_visible_devices(self, cuda_visible_devices: str): + os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices + + def get_cuda_visible_devices(self): + return os.environ["CUDA_VISIBLE_DEVICES"] + + +def test_cuda_device_count_stateless(): + """Test that cuda_device_count_stateless changes return value if + CUDA_VISIBLE_DEVICES is changed.""" + + actor = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote() + assert ray.get(actor.get_cuda_visible_devices.remote()) == "0,1" + assert ray.get(actor.get_count.remote()) == 2 + ray.get(actor.set_cuda_visible_devices.remote("0")) + assert ray.get(actor.get_count.remote()) == 1 + ray.get(actor.set_cuda_visible_devices.remote("")) + assert ray.get(actor.get_count.remote()) == 0 diff --git a/vllm/config.py b/vllm/config.py index 76c10d464..d9e4a619e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -11,7 +11,8 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization 
import QUANTIZATION_METHODS from vllm.model_executor.models import ModelRegistry from vllm.transformers_utils.config import get_config, get_hf_text_config -from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron, is_tpu +from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu, + is_hip, is_neuron, is_tpu) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -605,12 +606,11 @@ class ParallelConfig: if self.distributed_executor_backend is None and self.world_size > 1: # We use multiprocessing by default if world_size fits on the # current node and we aren't in a ray placement group. - from torch.cuda import device_count from vllm.executor import ray_utils backend = "mp" ray_found = ray_utils.ray is not None - if device_count() < self.world_size: + if cuda_device_count_stateless() < self.world_size: if not ray_found: raise ValueError("Unable to load Ray which is " "required for multi-node inference") diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 9a2b47594..b0cb21a02 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -11,6 +11,7 @@ from vllm.distributed.device_communicators.custom_all_reduce_utils import ( gpu_p2p_access_check) from vllm.distributed.parallel_state import is_in_the_same_node from vllm.logger import init_logger +from vllm.utils import cuda_device_count_stateless try: import pynvml @@ -144,7 +145,7 @@ class CustomAllreduce: if cuda_visible_devices: device_ids = list(map(int, cuda_visible_devices.split(","))) else: - device_ids = list(range(torch.cuda.device_count())) + device_ids = list(range(cuda_device_count_stateless())) physical_device_id = device_ids[device.index] tensor = torch.tensor([physical_device_id], diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py 
b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 1fd0058f6..c9573edb0 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -12,6 +12,7 @@ import torch.multiprocessing as mp import vllm.envs as envs from vllm.logger import init_logger +from vllm.utils import cuda_device_count_stateless logger = init_logger(__name__) @@ -152,7 +153,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool: is_distributed = dist.is_initialized() - num_dev = torch.cuda.device_count() + num_dev = cuda_device_count_stateless() cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES if cuda_visible_devices is None: cuda_visible_devices = ",".join(str(i) for i in range(num_dev)) diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 99c9e5203..8385e56f8 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -9,7 +9,8 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, ResultHandler, WorkerMonitor) from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, +from vllm.utils import (cuda_device_count_stateless, + get_distributed_init_method, get_ip, get_open_port, get_vllm_instance_id, make_async) logger = init_logger(__name__) @@ -33,8 +34,7 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor): # Disable torch async compiling which won't work with daemonic processes os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" - from torch.cuda import device_count - assert world_size <= device_count(), ( + assert world_size <= cuda_device_count_stateless(), ( "please set tensor_parallel_size to less than max local gpu count") distributed_init_method = get_distributed_init_method( diff --git a/vllm/utils.py b/vllm/utils.py index af585929d..b5c42605b 100644 --- 
a/vllm/utils.py +++ b/vllm/utils.py @@ -693,3 +693,38 @@ def deprecate_kwargs( return inner # type: ignore return wrapper + + +@lru_cache(maxsize=8) +def _cuda_device_count_stateless( + cuda_visible_devices: Optional[str] = None) -> int: + # Note: cuda_visible_devices is not used, but we keep it as an argument for + # LRU Cache purposes. + + # Code below is based on + # https://github.com/pytorch/pytorch/blob/ + # c1cd946818442aca8c7f812b16d187ce1586c3bc/ + # torch/cuda/__init__.py#L831C1-L831C17 + import torch.cuda + import torch.version + + if not torch.cuda._is_compiled(): + return 0 + # bypass _device_count_nvml() if rocm (not supported) + nvml_count = -1 if torch.version.hip else torch.cuda._device_count_nvml() + r = torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count + return r + + +def cuda_device_count_stateless() -> int: + """Get number of CUDA devices, caching based on the value of + CUDA_VISIBLE_DEVICES at the time of call. + + This should be used instead of torch.cuda.device_count() + unless CUDA_VISIBLE_DEVICES has already been set to the desired + value.""" + + # This can be removed and simply replaced with torch.cuda.get_device_count + # after https://github.com/pytorch/pytorch/pull/122815 is released. 
+ + return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES) -- GitLab From cd9c0d65d98f86fbd2235ee41b80107097a57f77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Fri, 14 Jun 2024 07:22:24 +0800 Subject: [PATCH 034/376] [Hardware][Intel] Support CPU inference with AVX2 ISA (#5452) --- cmake/cpu_extension.cmake | 6 +- csrc/cpu/cpu_types.hpp | 165 +++++++++++++++++++++++++++++++++++++- 2 files changed, 169 insertions(+), 2 deletions(-) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 61d484383..a644e5b6a 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -33,6 +33,7 @@ function (find_isa CPUINFO TARGET OUT) endif() endfunction() +find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) if (AVX512_FOUND) @@ -53,8 +54,11 @@ if (AVX512_FOUND) else() message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.") endif() +elseif (AVX2_FOUND) + list(APPEND CXX_COMPILE_FLAGS "-mavx2") + message(WARNING "vLLM CPU backend using AVX2 ISA") else() - message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.") + message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 ISA support.") endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index 034c406a5..d7621aaae 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -5,6 +5,10 @@ #include #include +#ifndef __AVX2__ +static_assert(false, "AVX2 must be supported for the current implementation."); +#endif + namespace vec_op { // FIXME: FP16 is not fully supported in Torch-CPU @@ -104,6 +108,7 @@ struct BF16Vec16 : public Vec { void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } }; +#ifdef __AVX512F__ struct BF16Vec32 : public Vec { constexpr static int VEC_ELEM_NUM = 32; @@ -123,6 
+128,34 @@ struct BF16Vec32 : public Vec { void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } }; +#else +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + __m256i reg_low; + __m256i reg_high; + + explicit BF16Vec32(const void *ptr) + : reg_low(_mm256_loadu_si256((__m256i const *)ptr)), + reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {} + + explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low), + reg_high(high) {} + + explicit BF16Vec32(BF16Vec8 &vec8_data) + : reg_low((__m256i)_mm256_inserti32x4( + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)), + reg_high((__m256i)_mm256_inserti32x4( + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)) {} + + void save(void *ptr) const { + *reinterpret_cast<__m256i *>(ptr) = reg_low; + *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high; + } +}; +#endif struct FP32Vec4 : public Vec { constexpr static int VEC_ELEM_NUM = 4; @@ -226,6 +259,7 @@ struct FP32Vec8 : public Vec { void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } }; +#ifdef __AVX512F__ struct FP32Vec16 : public Vec { constexpr static int VEC_ELEM_NUM = 16; union AliasReg { @@ -290,6 +324,114 @@ struct FP32Vec16 : public Vec { void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } }; +#else +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + union AliasReg { + __m256 reg; + float values[8]; + }; + + __m256 reg_low; + __m256 reg_high; + + explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)), + reg_high(_mm256_set1_ps(v)) {} + + explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)), + reg_high(_mm256_set1_ps(0.0)) {} + + explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)), + reg_high(_mm256_loadu_ps(ptr + 8)) {} + + explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} + + explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low), + reg_high(data.reg_high) 
{} + + explicit FP32Vec16(const FP32Vec4 &data) + : reg_low((__m256)_mm256_inserti128_si256( + _mm256_castsi128_si256((__m128i)data.reg), + (__m128i)data.reg, 1)), + reg_high((__m256)_mm256_inserti128_si256( + _mm256_castsi128_si256((__m128i)data.reg), + (__m128i)data.reg, 1)) {} + + explicit FP32Vec16(const FP32Vec8 &data) + : reg_low(data.reg), reg_high(data.reg) {} + + explicit FP32Vec16(const BF16Vec16 &v) { + __m128i low = _mm256_extractf128_si256(v.reg, 0); + __m128i high = _mm256_extractf128_si256(v.reg, 1); + + __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low); + __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high); + + __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2); + __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2); + + reg_low = _mm256_castsi256_ps(v_low_shifted); + reg_high = _mm256_castsi256_ps(v_high_shifted); + } + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), + _mm256_mul_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), + _mm256_add_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low), + _mm256_sub_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), + _mm256_div_ps(reg_high, b.reg_high)); + } + + float reduce_sum() const { + FP32Vec8 low = FP32Vec8(reg_low); + FP32Vec8 high = FP32Vec8(reg_high); + return low.reduce_sum() + high.reduce_sum(); + } + + template float reduce_sub_sum(int idx) { + float sum = 0.0; + static_assert(VEC_ELEM_NUM % group_size == 0); + constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); + uint32_t mask = base_mask << (idx * group_size); + + AliasReg ar; + + auto func = [&sum, &mask, &ar](int i) { + int flag = mask & 
0x1; + mask = mask >> 1; + if (flag != 0) sum += ar.values[i]; + }; + + ar.reg = reg_low; + unroll_loop(func); + + ar.reg = reg_high; + unroll_loop(func); + + return sum; + } + + void save(float *ptr) const { + _mm256_storeu_ps(ptr, reg_low); + _mm256_storeu_ps(ptr + 8, reg_high); + } +}; +#endif template struct VecType { using vec_type = void; }; @@ -336,6 +478,7 @@ template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { *ptr = *(v_ptr + 1); } +#ifdef __AVX512F__ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(_mm256_cvtepi32_epi16( _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} @@ -343,7 +486,27 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg(_mm512_cvtepi32_epi16( _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} -#endif +#else +namespace{ +__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { + __m256i ai = _mm256_castps_si256(a); + ai = _mm256_srli_epi32(ai, 16); + ai = _mm256_packus_epi32(ai, ai); + ai = _mm256_permute4x64_epi64(ai, 0b00111001); + return _mm256_extracti128_si256(ai, 0); +} +} + +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { + BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low)); + BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high)); + reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); +} +#endif // __AVX512F__ +#endif // __AVX512BF16__ inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } -- GitLab From 55d6361b13ae6328de809f57a69b719c1600040a Mon Sep 17 00:00:00 2001 From: "Allen.Dou" Date: Fri, 14 Jun 2024 10:02:53 +0800 Subject: [PATCH 035/376] [Misc] Fix arg names in quantizer script (#5507) --- examples/fp8/quantizer/quantize.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/fp8/quantizer/quantize.py b/examples/fp8/quantizer/quantize.py index cee13b4c9..15f1a06b1 100644 --- 
a/examples/fp8/quantizer/quantize.py +++ b/examples/fp8/quantizer/quantize.py @@ -332,7 +332,7 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--model_dir", + parser.add_argument("--model-dir", help="Specify where the HuggingFace model is", required=True) parser.add_argument("--device", default="cuda") @@ -346,19 +346,19 @@ if __name__ == "__main__": "full_prec" ], ) - parser.add_argument("--batch_size", + parser.add_argument("--batch-size", help="Batch size for calibration.", type=int, default=1) - parser.add_argument("--calib_size", + parser.add_argument("--calib-size", help="Number of samples for calibration.", type=int, default=512) - parser.add_argument("--output_dir", default="exported_model") - parser.add_argument("--tp_size", type=int, default=1) - parser.add_argument("--pp_size", type=int, default=1) - parser.add_argument("--awq_block_size", type=int, default=128) - parser.add_argument("--kv_cache_dtype", + parser.add_argument("--output-dir", default="exported_model") + parser.add_argument("--tp-size", type=int, default=1) + parser.add_argument("--pp-size", type=int, default=1) + parser.add_argument("--awq-block-size", type=int, default=128) + parser.add_argument("--kv-cache-dtype", help="KV Cache dtype.", default=None, choices=["int8", "fp8", None]) -- GitLab From 0f0d8bc065f3608e7657a9696f5d2d7c0d6722d1 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Thu, 13 Jun 2024 21:42:06 -0500 Subject: [PATCH 036/376] bump version to v0.5.0.post1 (#5522) --- vllm/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/version.py b/vllm/version.py index 3d187266f..2b33ffcf5 100644 --- a/vllm/version.py +++ b/vllm/version.py @@ -1 +1 @@ -__version__ = "0.5.0" +__version__ = "0.5.0.post1" -- GitLab From 319ad7f1d386699e94f629341c9988a926821f24 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 13 Jun 2024 22:36:20 -0700 Subject: [PATCH 037/376] [CI/Build][Misc] Add CI 
that benchmarks vllm performance on those PRs with `perf-benchmarks` label (#5073) Co-authored-by: simon-mo --- .buildkite/nightly-benchmarks/README.md | 98 +++++ .../benchmark-pipeline.yaml | 61 +++ .../nightly-benchmarks/kickoff-pipeline.sh | 3 +- .../nightly-benchmarks/latency-tests.json | 32 ++ .../run-benchmarks-suite.sh | 358 ++++++++++++++++++ .buildkite/nightly-benchmarks/sample.yaml | 39 -- .../convert-results-json-to-markdown.py | 155 ++++++++ .../scripts/wait-for-image.sh | 17 + .../nightly-benchmarks/serving-tests.json | 59 +++ .../nightly-benchmarks/throughput-tests.json | 35 ++ benchmarks/benchmark_latency.py | 25 ++ benchmarks/benchmark_serving.py | 11 + benchmarks/benchmark_throughput.py | 28 +- 13 files changed, 880 insertions(+), 41 deletions(-) create mode 100644 .buildkite/nightly-benchmarks/README.md create mode 100644 .buildkite/nightly-benchmarks/benchmark-pipeline.yaml create mode 100644 .buildkite/nightly-benchmarks/latency-tests.json create mode 100644 .buildkite/nightly-benchmarks/run-benchmarks-suite.sh delete mode 100644 .buildkite/nightly-benchmarks/sample.yaml create mode 100644 .buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py create mode 100644 .buildkite/nightly-benchmarks/scripts/wait-for-image.sh create mode 100644 .buildkite/nightly-benchmarks/serving-tests.json create mode 100644 .buildkite/nightly-benchmarks/throughput-tests.json diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md new file mode 100644 index 000000000..6a18be947 --- /dev/null +++ b/.buildkite/nightly-benchmarks/README.md @@ -0,0 +1,98 @@ +# vLLM benchmark suite + +## Introduction + +This directory contains the performance benchmarking CI for vllm. +The goal is to help developers know the impact of their PRs on the performance of vllm. + +This benchmark will be *triggered* upon: +- A PR being merged into vllm. +- Every commit for those PRs with `perf-benchmarks` label. 
+ +**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on A100 (support for more GPUs is coming later), with different models. + +**Benchmarking Duration**: about 1hr. + +## Configuring the workload for the quick benchmark + +The workload of the quick benchmark contains three parts: latency tests in `latency-tests.json`, throughput tests in `throughput-tests.json` and serving tests in `serving-tests.json`. + +### Latency test + +Here is an example of one test inside `latency-tests.json`: + +```json +[ + ... + { + "test_name": "latency_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + ... +] +``` + +In this example: +- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. +- The `parameters` attribute controls the command line arguments to be used for `benchmark_latency.py`. Please use an underscore `_` instead of a dash `-` when specifying the command line arguments; `run-benchmarks-suite.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`. + +Note that the performance numbers are highly sensitive to the values of the parameters. Please make sure the parameters are set correctly. + +WARNING: The benchmarking script will save JSON results by itself, so please do not configure the `--output-json` parameter in the JSON file. + + +### Throughput test +The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters will be passed to `benchmark_throughput.py`.
+ +The numbers from this test are also stable, so even a slight change in them likely reflects a real performance difference. + +### Serving test +We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: + +``` +[ + ... + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + ... +] +``` + +Inside this example: +- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`. +- The `server_parameters` attribute includes the command line arguments for the vLLM server. +- The `client_parameters` attribute includes the command line arguments for `benchmark_serving.py`. +- The `qps_list` attribute controls the list of QPS values to test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`. + +The numbers from this test are less stable than those of the latency and throughput benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change in them (e.g. a 5% change) is still significant. + +WARNING: The benchmarking script will save JSON results by itself, so please do not configure `--save-result` or other results-saving-related parameters in `serving-tests.json`. + +## Visualizing the results +The `convert-results-json-to-markdown.py` script helps you put the benchmarking results inside a markdown table. +You can find the results presented as a table inside the `buildkite/performance-benchmark` job page.
+If you do not see the table, please wait till the benchmark finish running. +The JSON file is also attached within each buildkite job for further analysis. \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml new file mode 100644 index 000000000..8f12748b6 --- /dev/null +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -0,0 +1,61 @@ +steps: + - label: "Wait for container to be ready" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + containers: + - image: badouralix/curl-jq + command: + - sh + - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh + - wait + - label: "A100 Benchmark" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + command: + - bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + # - label: "H100: NVIDIA SMI" + # agents: + # queue: H100 + # plugins: + # - docker#v5.11.0: + # image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + # command: + # - bash + # - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh + # mount-buildkite-agent: true + # propagate-environment: true + # propagate-uid-gid: false + # ipc: host + # gpus: all + # environment: + # - VLLM_USAGE_SOURCE + # - HF_TOKEN + diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh index d3bf3b729..15d411feb 100755 --- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh +++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh @@ -1,5 +1,6 
@@ #!/usr/bin/env bash +# NOTE(simon): this script runs inside a buildkite agent with CPU only access. set -euo pipefail # Install system packages @@ -23,4 +24,4 @@ if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then fi # Upload sample.yaml -buildkite-agent pipeline upload .buildkite/nightly-benchmarks/sample.yaml +buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml diff --git a/.buildkite/nightly-benchmarks/latency-tests.json b/.buildkite/nightly-benchmarks/latency-tests.json new file mode 100644 index 000000000..294a8c439 --- /dev/null +++ b/.buildkite/nightly-benchmarks/latency-tests.json @@ -0,0 +1,32 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + { + "test_name": "latency_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + }, + { + "test_name": "latency_mixtral8x7B_tp2", + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + } +] diff --git a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh new file mode 100644 index 000000000..6cff6917f --- /dev/null +++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh @@ -0,0 +1,358 @@ +#!/bin/bash + +# This script should be run inside the CI process +# This script assumes that we are already inside the vllm/ directory +# Benchmarking results will be available inside vllm/benchmarks/results/ + +# Do not set -e, as the mixtral 8x22B model tends to crash occasionally +# and we still want to see other benchmarking results even when mixtral crashes. 
+set -o pipefail + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + echo "GPU type is $gpu_type" +} + +check_hf_token() { + # check if HF_TOKEN is available and valid + if [[ -z "$HF_TOKEN" ]]; then + echo "Error: HF_TOKEN is not set." + exit 1 + elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then + echo "Error: HF_TOKEN does not start with 'hf_'." + exit 1 + else + echo "HF_TOKEN is set and valid." + fi +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + timeout 1200 bash -c ' + until curl localhost:8000/v1/completions; do + sleep 1 + done' && return 0 || return 1 +} + +kill_gpu_processes() { + # kill all processes on GPU. + pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader) + if [ -z "$pids" ]; then + echo "No GPU processes found." + else + for pid in $pids; do + kill -9 "$pid" + echo "Killed process with PID: $pid" + done + + echo "All GPU processes have been killed." + fi + + # waiting for GPU processes to be fully killed + sleep 10 + + # remove vllm config file + rm -rf ~/.config/vllm + + # Print the GPU memory usage + # so that we know if all GPU processes are killed. 
+ gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. + echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md + /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" +} + +run_latency_tests() { + # run latency tests using `benchmark_latency.py` + # $1: a json file specifying latency test cases + + local latency_test_file + latency_test_file=$1 + + # Iterate over latency tests + jq -c '.[]' "$latency_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^latency_ ]]; then + echo "In latency-test.json, test_name must start with \"latency_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # get arguments + latency_params=$(echo "$params" | jq -r '.parameters') + latency_args=$(json2args "$latency_params") + + # check if there is enough GPU to run the test + tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." 
+ continue + fi + + latency_command="python3 benchmark_latency.py \ + --output-json $RESULTS_FOLDER/${test_name}.json \ + $latency_args" + + echo "Running test case $test_name" + echo "Latency command: $latency_command" + + # recoding benchmarking command ang GPU command + jq_output=$(jq -n \ + --arg latency "$latency_command" \ + --arg gpu "$gpu_type" \ + '{ + latency_command: $latency, + gpu_type: $gpu + }') + echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands" + + # run the benchmark + eval "$latency_command" + + kill_gpu_processes + + done +} + + +run_throughput_tests() { + # run throughput tests using `benchmark_throughput.py` + # $1: a json file specifying throughput test cases + + local throughput_test_file + throughput_test_file=$1 + + # Iterate over throughput tests + jq -c '.[]' "$throughput_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^throughput_ ]]; then + echo "In throughput-test.json, test_name must start with \"throughput_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # get arguments + throughput_params=$(echo "$params" | jq -r '.parameters') + throughput_args=$(json2args "$throughput_params") + + # check if there is enough GPU to run the test + tp=$(echo $throughput_params | jq -r '.tensor_parallel_size') + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." 
+ continue + fi + + throughput_command="python3 benchmark_throughput.py \ + --output-json $RESULTS_FOLDER/${test_name}.json \ + $throughput_args" + + echo "Running test case $test_name" + echo "Throughput command: $throughput_command" + # recoding benchmarking command ang GPU command + jq_output=$(jq -n \ + --arg command "$throughput_command" \ + --arg gpu "$gpu_type" \ + '{ + throughput_command: $command, + gpu_type: $gpu + }') + echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands" + + # run the benchmark + eval "$throughput_command" + + kill_gpu_processes + + done +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^serving_ ]]; then + echo "In serving-test.json, test_name must start with \"serving_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + + # get client and server arguments + server_params=$(echo "$params" | jq -r '.server_parameters') + client_params=$(echo "$params" | jq -r '.client_parameters') + server_args=$(json2args "$server_params") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." 
+ continue + fi + + # check if server model and client model is aligned + server_model=$(echo "$server_params" | jq -r '.model') + client_model=$(echo "$client_params" | jq -r '.model') + if [[ $server_model != "$client_model" ]]; then + echo "Server model and client model must be the same. Skip testcase $testname." + continue + fi + + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + $server_args" + + # run the server + echo "Running test case $test_name" + echo "Server command: $server_command" + eval "$server_command" & + + # wait until the server is alive + wait_for_server + if [ $? -eq 0 ]; then + echo "" + echo "vllm server is up and running." + else + echo "" + echo "vllm failed to start within the timeout period." + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + client_command="python3 benchmark_serving.py \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + $client_args" + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu + }') + echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands" + + done + + # clean up + kill_gpu_processes + done +} + +main() { + check_gpus + check_hf_token + + # dependencies + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get update && apt-get -y install jq) + + # get the current IP address, required by benchmark_serving.py + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + # 
turn of the reporting of the status of each request, to clean up the terminal output + export VLLM_LOG_LEVEL="WARNING" + + # prepare for benchmarking + cd benchmarks || exit 1 + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + + # benchmarking + run_serving_tests $QUICK_BENCHMARK_ROOT/serving-tests.json + run_latency_tests $QUICK_BENCHMARK_ROOT/latency-tests.json + run_throughput_tests $QUICK_BENCHMARK_ROOT/throughput-tests.json + + + # postprocess benchmarking results + pip install tabulate pandas + python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py + + upload_to_buildkite +} + +main "$@" diff --git a/.buildkite/nightly-benchmarks/sample.yaml b/.buildkite/nightly-benchmarks/sample.yaml deleted file mode 100644 index 50e6e8207..000000000 --- a/.buildkite/nightly-benchmarks/sample.yaml +++ /dev/null @@ -1,39 +0,0 @@ -steps: - # NOTE(simon): You can create separate blocks for different jobs - - label: "A100: NVIDIA SMI" - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - containers: - # - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT - # TODO(simon): check latest main branch or use the PR image. 
- - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6 - command: - - bash -c 'nvidia-smi && nvidia-smi topo -m && pwd && ls' - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - # TODO(simon): bring H100 online - # - label: "H100: NVIDIA SMI" - # agents: - # queue: H100 - # plugins: - # - docker#v5.11.0: - # image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6 - # command: - # - bash -c 'nvidia-smi && nvidia-smi topo -m' - # propagate-environment: true - # ipc: host - # gpus: all - diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py new file mode 100644 index 000000000..75cff8434 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -0,0 +1,155 @@ +import json +from pathlib import Path + +import pandas as pd +from tabulate import tabulate + +results_folder = Path("results/") + +# latency results and the keys that will be printed into markdown +latency_results = [] +latency_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + "avg_latency": "Average latency (s)", + "P10": "P10 (s)", + "P25": "P25 (s)", + "P50": "P50 (s)", + "P75": "P75 (s)", + "P90": "P90 (s)", +} + +# thoughput tests and the keys that will be printed into markdown +throughput_results = [] +throughput_results_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + "num_requests": "# of req.", + "total_num_tokens": "Total # of tokens", + "elapsed_time": "Elapsed time (s)", + "requests_per_second": "Tput (req/s)", + "tokens_per_second": "Tput (tok/s)", +} + +# serving results and the keys that will be printed into markdown 
+serving_results = [] +serving_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + "completed": "# of req.", + "request_throughput": "Tput (req/s)", + "input_throughput": "Input Tput (tok/s)", + "output_throughput": "Output Tput (tok/s)", + "mean_ttft_ms": "Mean TTFT (ms)", + # do not say TTFT again to avoid the table getting too wide + "median_ttft_ms": "Median", + "p99_ttft_ms": "P99", + "mean_tpot_ms": "Mean TPOT (ms)", + "median_tpot_ms": "Median", + "p99_tpot_ms": "P99", + "mean_itl_ms": "Mean ITL (ms)", + "median_itl_ms": "Median", + "p99_itl_ms": "P99", +} + +for test_file in results_folder.glob("*.json"): + + with open(test_file, "r") as f: + raw_result = json.loads(f.read()) + + if "serving" in str(test_file): + # this result is generated via `benchmark_serving.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + serving_results.append(raw_result) + continue + + elif "latency" in f.name: + # this result is generated via `benchmark_latency.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # get different percentiles + for perc in [10, 25, 50, 75, 90]: + raw_result.update( + {f"P{perc}": raw_result["percentiles"][str(perc)]}) + + # add the result to raw_result + latency_results.append(raw_result) + continue + + elif "throughput" in f.name: + # this result is generated via `benchmark_throughput.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + 
# update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + throughput_results.append(raw_result) + continue + + print(f"Skipping {test_file}") + +latency_results = pd.DataFrame.from_dict(latency_results) +serving_results = pd.DataFrame.from_dict(serving_results) +throughput_results = pd.DataFrame.from_dict(throughput_results) + +# remapping the key, for visualization purpose +if not latency_results.empty: + latency_results = latency_results[list( + latency_column_mapping.keys())].rename(columns=latency_column_mapping) +if not serving_results.empty: + serving_results = serving_results[list( + serving_column_mapping.keys())].rename(columns=serving_column_mapping) +if not throughput_results.empty: + throughput_results = throughput_results[list( + throughput_results_column_mapping.keys())].rename( + columns=throughput_results_column_mapping) + +# get markdown tables +latency_md_table = tabulate(latency_results, + headers='keys', + tablefmt='pipe', + showindex=False) +serving_md_table = tabulate(serving_results, + headers='keys', + tablefmt='pipe', + showindex=False) +throughput_md_table = tabulate(throughput_results, + headers='keys', + tablefmt='pipe', + showindex=False) + +# document the result +with open(results_folder / "benchmark_results.md", "w") as f: + if not latency_results.empty: + f.write("## Latency tests\n") + f.write(latency_md_table) + f.write("\n") + if not throughput_results.empty: + f.write("## Throughput tests\n") + f.write(throughput_md_table) + f.write("\n") + if not serving_results.empty: + f.write("## Serving tests\n") + f.write(serving_md_table) + f.write("\n") diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh new file mode 100644 index 000000000..c785e6a0d --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh @@ -0,0 +1,17 @@ +#!/bin/sh +TOKEN=$(curl -s -L 
"https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token) +URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" + +retries=0 +while [ $retries -lt 1000 ]; do + if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then + exit 0 + fi + + echo "Waiting for image to be available..." + + retries=$((retries + 1)) + sleep 5 +done + +exit 1 \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/serving-tests.json b/.buildkite/nightly-benchmarks/serving-tests.json new file mode 100644 index 000000000..bb6746612 --- /dev/null +++ b/.buildkite/nightly-benchmarks/serving-tests.json @@ -0,0 +1,59 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_mixtral8x7B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "swap_space": 16, + 
"disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] diff --git a/.buildkite/nightly-benchmarks/throughput-tests.json b/.buildkite/nightly-benchmarks/throughput-tests.json new file mode 100644 index 000000000..db4f908d7 --- /dev/null +++ b/.buildkite/nightly-benchmarks/throughput-tests.json @@ -0,0 +1,35 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_mixtral8x7B_tp2", + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 17edb7515..9937f8333 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -10,6 +10,7 @@ import torch from tqdm import tqdm from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptStrictInputs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS @@ -37,6 +38,7 @@ def main(args: argparse.Namespace): download_dir=args.download_dir, block_size=args.block_size, 
gpu_memory_utilization=args.gpu_memory_utilization, + load_format=args.load_format, distributed_executor_backend=args.distributed_executor_backend) sampling_params = SamplingParams( @@ -222,6 +224,29 @@ if __name__ == '__main__': help='the fraction of GPU memory to be used for ' 'the model executor, which can range from 0 to 1.' 'If unspecified, will use the default value of 0.9.') + parser.add_argument( + '--load-format', + type=str, + default=EngineArgs.load_format, + choices=[ + 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', + 'bitsandbytes' + ], + help='The format of the model weights to load.\n\n' + '* "auto" will try to load the weights in the safetensors format ' + 'and fall back to the pytorch bin format if safetensors format ' + 'is not available.\n' + '* "pt" will load the weights in the pytorch bin format.\n' + '* "safetensors" will load the weights in the safetensors format.\n' + '* "npcache" will load the weights in pytorch format and store ' + 'a numpy cache to speed up the loading.\n' + '* "dummy" will initialize the weights with random values, ' + 'which is mainly for profiling.\n' + '* "tensorizer" will load the weights using tensorizer from ' + 'CoreWeave. 
See the Tensorize vLLM Model script in the Examples ' + 'section for more information.\n' + '* "bitsandbytes" will load the weights using bitsandbytes ' + 'quantization.\n') parser.add_argument( '--distributed-executor-backend', choices=['ray', 'mp'], diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 4112a3272..df32b366c 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -499,6 +499,8 @@ def main(args: argparse.Namespace): # Save to file base_model_id = model_id.split("/")[-1] file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa + if args.result_filename: + file_name = args.result_filename if args.result_dir: file_name = os.path.join(args.result_dir, file_name) with open(file_name, "w") as outfile: @@ -639,6 +641,15 @@ if __name__ == "__main__": help="Specify directory to save benchmark json results." "If not specified, results are saved in the current directory.", ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results. " 
+ "If not specified, results will be saved in " + "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" + " format.", + ) args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 07b2f8541..463d9973d 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -10,6 +10,7 @@ from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) +from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS @@ -81,6 +82,7 @@ def run_vllm( distributed_executor_backend: Optional[str], gpu_memory_utilization: float = 0.9, download_dir: Optional[str] = None, + load_format: str = EngineArgs.load_format, ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -102,6 +104,7 @@ def run_vllm( enable_chunked_prefill=enable_chunked_prefill, max_num_batched_tokens=max_num_batched_tokens, distributed_executor_backend=distributed_executor_backend, + load_format=load_format, ) # Add the requests to the engine. @@ -228,7 +231,7 @@ def main(args: argparse.Namespace): args.quantization_param_path, args.device, args.enable_prefix_caching, args.enable_chunked_prefill, args.max_num_batched_tokens, args.distributed_executor_backend, - args.gpu_memory_utilization, args.download_dir) + args.gpu_memory_utilization, args.download_dir, args.load_format) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -377,6 +380,29 @@ if __name__ == "__main__": help='Backend to use for distributed serving. 
When more than 1 GPU ' 'is used, will be automatically set to "ray" if installed ' 'or "mp" (multiprocessing) otherwise.') + parser.add_argument( + '--load-format', + type=str, + default=EngineArgs.load_format, + choices=[ + 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', + 'bitsandbytes' + ], + help='The format of the model weights to load.\n\n' + '* "auto" will try to load the weights in the safetensors format ' + 'and fall back to the pytorch bin format if safetensors format ' + 'is not available.\n' + '* "pt" will load the weights in the pytorch bin format.\n' + '* "safetensors" will load the weights in the safetensors format.\n' + '* "npcache" will load the weights in pytorch format and store ' + 'a numpy cache to speed up the loading.\n' + '* "dummy" will initialize the weights with random values, ' + 'which is mainly for profiling.\n' + '* "tensorizer" will load the weights using tensorizer from ' + 'CoreWeave. See the Tensorize vLLM Model script in the Examples ' + 'section for more information.\n' + '* "bitsandbytes" will load the weights using bitsandbytes ' + 'quantization.\n') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model -- GitLab From d47af2bc0208d50ed36ae877876c1d2eafdc933a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Jun 2024 00:27:30 +0800 Subject: [PATCH 038/376] [CI/Build] Disable LLaVA-NeXT CPU test (#5529) --- .buildkite/run-cpu-test.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 6a86bc0eb..5f9ca5d75 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -19,6 +19,5 @@ docker exec cpu-test bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c "cd tests; pip install pytest Pillow protobuf - bash ../.buildkite/download-images.sh cd ../ - pytest -v -s tests/models --ignore=tests/models/test_llava.py 
--ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" + pytest -v -s tests/models -m \"not llava\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" -- GitLab From 703475f6c2771600acc27eba76f6a750f54aae50 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 14 Jun 2024 12:30:15 -0400 Subject: [PATCH 039/376] [Kernel] Fix CUTLASS 3.x custom broadcast load epilogue (#5516) --- .../quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp | 2 +- vllm/model_executor/layers/quantization/fp8.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp index 8f38bbf50..877a9f5b9 100644 --- a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp +++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp @@ -153,7 +153,7 @@ struct Sm90RowOrScalarBroadcast { CUTLASS_DEVICE void begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) { - if (params.ptr_row == nullptr) { + if (!params.row_broadcast) { return; } diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index bc08bfcc3..e89fd6581 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -257,9 +257,7 @@ class Fp8LinearMethod(LinearMethodBase): # If dynamic, layer.input_scale is None and x_scale computed from x. # If static, layer.input_scale is scalar and x_scale is input_scale. 
- # Temporarily disable CUTLASS kernels due to an illegal memory access - #if bias is None and self.cutlass_fp8_supported: - if False: + if bias is None and self.cutlass_fp8_supported: qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale) # Fused GEMM_DQ -- GitLab From d74674bbd978fad7f27a252650249bc2550f3e92 Mon Sep 17 00:00:00 2001 From: "Allen.Dou" Date: Sat, 15 Jun 2024 00:47:44 +0800 Subject: [PATCH 040/376] [Misc] Fix arg names (#5524) --- benchmarks/kernels/benchmark_paged_attention.py | 2 +- examples/aqlm_example.py | 2 +- examples/fp8/extract_scales.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index e6f4e9e6b..a5355f4c1 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -165,7 +165,7 @@ if __name__ == '__main__': choices=["v1", "v2"], default="v2") parser.add_argument("--batch-size", type=int, default=8) - parser.add_argument("--seq_len", type=int, default=4096) + parser.add_argument("--seq-len", type=int, default=4096) parser.add_argument("--num-query-heads", type=int, default=64) parser.add_argument("--num-kv-heads", type=int, default=8) parser.add_argument("--head-size", diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index e7c17fa03..3a63003ab 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -17,7 +17,7 @@ def main(): type=int, default=0, help='known good models by index, [0-4]') - parser.add_argument('--tensor_parallel_size', + parser.add_argument('--tensor-parallel-size', '-t', type=int, default=1, diff --git a/examples/fp8/extract_scales.py b/examples/fp8/extract_scales.py index 1eb961a5a..e007a3bc0 100644 --- a/examples/fp8/extract_scales.py +++ b/examples/fp8/extract_scales.py @@ -327,7 +327,7 @@ if __name__ == "__main__": "--quantization-param-path ). 
This is only used " "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") parser.add_argument( - "--quantized_model", + "--quantized-model", help="Specify the directory containing a single quantized HF model. " "It is expected that the quantization format is FP8_E4M3, for use " "on ROCm (AMD GPU).", @@ -339,18 +339,18 @@ if __name__ == "__main__": choices=["auto", "safetensors", "npz", "pt"], default="auto") parser.add_argument( - "--output_dir", + "--output-dir", help="Optionally specify the output directory. By default the " "KV cache scaling factors will be saved in the model directory, " "however you can override this behavior here.", default=None) parser.add_argument( - "--output_name", + "--output-name", help="Optionally specify the output filename.", # TODO: Change this once additional scaling factors are enabled default="kv_cache_scales.json") parser.add_argument( - "--tp_size", + "--tp-size", help="Optionally specify the tensor-parallel (TP) size that the " "quantized model should correspond to. 
If specified, during KV " "cache scaling factor extraction the observed TP size will be " -- GitLab From 15985680e2278610e873cc07ec72fa514ace72e9 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Fri, 14 Jun 2024 13:01:46 -0400 Subject: [PATCH 041/376] [ Misc ] Rs/compressed tensors cleanup (#5432) Co-authored-by: mgoin Co-authored-by: Dipika Sikka --- .../compressed_tensors/compressed_tensors.py | 2 +- .../schemes/compressed_tensors_w4a16.py | 21 +++++++++++-------- .../compressed_tensors_w8a8_dynamictoken.py | 18 +++++++--------- .../compressed_tensors_w8a8_statictensor.py | 16 -------------- 4 files changed, 21 insertions(+), 36 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index c7f047845..e134a26ef 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -26,7 +26,7 @@ class CompressedTensorsConfig(QuantizationConfig): return [] def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.float16] + return [torch.float16, torch.bfloat16] # Need to figure it out def get_min_capability(self) -> int: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py index 90446a5ff..373458cff 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py @@ -64,10 +64,9 @@ class CompressedTensorsW4A16(CompressedTensorsScheme): "input_dim": 1, "output_dim": 0, "packed_dim": 1, - "pack_factor": pack_factor + "pack_factor": pack_factor, + 
"weight_loader": weight_loader }) - set_weight_attrs(weight, {"weight_loader": weight_loader}) - layer.register_parameter("weight_packed", weight) weight_scale = Parameter( @@ -79,11 +78,12 @@ class CompressedTensorsW4A16(CompressedTensorsScheme): requires_grad=False, ) - set_weight_attrs(weight_scale, {"weight_loader": weight_loader}) - set_weight_attrs(weight_scale, { - "input_dim": weight_scale_dim, - "output_dim": 0 - }) + set_weight_attrs( + weight_scale, { + "weight_loader": weight_loader, + "input_dim": weight_scale_dim, + "output_dim": 0 + }) layer.register_parameter("weight_scale", weight_scale) # A 2D array defining the original shape of the weights @@ -92,7 +92,10 @@ class CompressedTensorsW4A16(CompressedTensorsScheme): requires_grad=False) layer.register_parameter("weight_shape", weight_shape) - set_weight_attrs(weight_shape, {"weight_loader": weight_loader}) + set_weight_attrs(weight_shape, { + "weight_loader": weight_loader, + "ignore_warning": True, + }) layer.input_size_per_partition = input_size_per_partition layer.output_size_per_partition = output_size_per_partition diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py index 9bb7bf447..d514d7b28 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py @@ -48,9 +48,6 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme): weight_scale_dim = sum( output_partition_sizes) if is_tensor_partitioned else 1 - weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8), - requires_grad=False) - weight_scale = Parameter(torch.empty(weight_scale_dim, dtype=torch.float32), requires_grad=False) @@ -61,21 +58,22 @@ class 
CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme): requires_grad=False) layer.register_parameter("weight", weight) - set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) - set_weight_attrs(weight, {"weight_loader": weight_loader}) - set_weight_attrs(weight, {"logical_widths": output_partition_sizes}) + set_weight_attrs( + weight, { + "input_dim": 1, + "output_dim": 0, + "weight_loader": weight_loader, + "logical_widths": output_partition_sizes + }) layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs(weight_scale, {"weight_loader": weight_loader}) set_weight_attrs( weight_scale, { + "weight_loader": weight_loader, "shard_splitter": self.scales_shard_splitter, "logical_widths": output_partition_sizes }) - layer.register_parameter("weight_zero_point", weight_zero_point) - set_weight_attrs(weight_zero_point, {"weight_loader": weight_loader}) - def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): weight = layer.weight weight_scale = layer.weight_scale diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py index 88c15c5c2..414e17a06 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py @@ -39,22 +39,16 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme): params_dtype: torch.dtype, weight_loader: Callable, **kwargs): - # TODO: remove zero_point parameters once the configs given remove them - is_tensor_partitioned = len(output_partition_sizes) != 1 weight_scale_dim = sum( output_partition_sizes) if is_tensor_partitioned else 1 input_scale = Parameter(torch.empty(1, dtype=torch.float32), requires_grad=False) - input_zero_point = Parameter(torch.empty(1, 
dtype=torch.int8), - requires_grad=False) weight_scale = Parameter(torch.empty(weight_scale_dim, dtype=torch.float32), requires_grad=False) - weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8), - requires_grad=False) weight = Parameter(torch.empty(sum(output_partition_sizes), input_size_per_partition, @@ -72,11 +66,6 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme): "weight_loader": weight_loader, "ignore_warning": True, }) - layer.register_parameter("input_zero_point", input_zero_point) - set_weight_attrs(input_zero_point, { - "weight_loader": weight_loader, - "ignore_warning": True, - }) layer.register_parameter("weight_scale", weight_scale) set_weight_attrs( weight_scale, { @@ -85,11 +74,6 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme): "logical_widths": output_partition_sizes, "ignore_warning": True, }) - layer.register_parameter("weight_zero_point", weight_zero_point) - set_weight_attrs(weight_zero_point, { - "weight_loader": weight_loader, - "ignore_warning": True - }) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): weight = layer.weight -- GitLab From 348616ac4b72e2acc6e9a60ae94cf0f7fc29ac31 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 14 Jun 2024 13:02:00 -0400 Subject: [PATCH 042/376] [Kernel] Suppress mma.sp warning on CUDA 12.5 and later (#5401) --- csrc/quantization/marlin/sparse/common/mma.h | 74 +++++++++++--------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index 45ab67a78..8a6c65338 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -20,6 +20,19 @@ namespace marlin_24 { +// On CUDA earlier than 12.5, the ordered_metadata version of this instruction +// is not supported. 
On later versions of CUDA the version without ordered +// metadata results in the following warning: +// | Advisory: Modifier ‘.sp::ordered_metadata’ should be used on instruction +// | ‘mma’ instead of modifier ‘.sp’ as it is expected to have substantially +// | reduced performance on some future architectures +#if defined CUDA_VERSION && CUDA_VERSION >= 12500 + #define MMA_SP_INST \ + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " +#else + #define MMA_SP_INST "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " +#endif + // m16n8k32 sparse tensor core mma instruction with fp16 inputs and fp32 // output/accumulation. __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, @@ -29,41 +42,38 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, const uint32_t* a1 = reinterpret_cast(&a_frag1); const uint32_t* b = reinterpret_cast(&frag_b); const uint32_t* e = reinterpret_cast(&frag_m); + float* c = reinterpret_cast(&frag_c); if (psel == 0) { - asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " - "{%12,%13,%14,%15}, %16, 0x0;\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), "r"(b[2]), - "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), - "r"(e[0])); - asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " - "{%12,%13,%14,%15}, %16, 0x0;\n" - : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) - : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), "r"(b[3]), - "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7]), - "r"(e[0])); + asm volatile(MMA_SP_INST + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " + "{%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), 
"r"(a1[1]), "r"(b[0]), + "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), + "f"(c[2]), "f"(c[3]), "r"(e[0])); + asm volatile(MMA_SP_INST + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " + "{%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) + : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), + "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), + "f"(c[6]), "f"(c[7]), "r"(e[0])); } else { - asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " - "{%12,%13,%14,%15}, %16, 0x1;\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), "r"(b[2]), - "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), - "r"(e[0])); - asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " - "{%12,%13,%14,%15}, %16, 0x1;\n" - : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) - : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), "r"(b[3]), - "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7]), - "r"(e[0])); + asm volatile(MMA_SP_INST + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " + "{%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), + "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), + "f"(c[2]), "f"(c[3]), "r"(e[0])); + asm volatile(MMA_SP_INST + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " + "{%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) + : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), + "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), + "f"(c[6]), "f"(c[7]), "r"(e[0])); } } -- GitLab From 48f589e18b8b6758dbfb6bb23b2994430893b477 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 14 Jun 2024 10:02:23 
-0700 Subject: [PATCH 043/376] [mis] fix flaky test of test_cuda_device_count_stateless (#5546) --- tests/distributed/test_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index b7ec59c7a..923ad66c2 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -23,7 +23,8 @@ def test_cuda_device_count_stateless(): CUDA_VISIBLE_DEVICES is changed.""" actor = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote() - assert ray.get(actor.get_cuda_visible_devices.remote()) == "0,1" + assert sorted(ray.get( + actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"] assert ray.get(actor.get_count.remote()) == 2 ray.get(actor.set_cuda_visible_devices.remote("0")) assert ray.get(actor.get_count.remote()) == 1 -- GitLab From 77490c6f2f1e99982d2553832a42980bbdee820c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Jun 2024 01:04:42 +0800 Subject: [PATCH 044/376] [Core] Remove duplicate processing in async engine (#5525) --- vllm/engine/async_llm_engine.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 943402c86..03b6d03a9 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -580,21 +580,9 @@ class AsyncLLMEngine: if arrival_time is None: arrival_time = time.time() - if self.engine_use_ray: - processed_inputs = await self.engine.process_model_inputs_async \ - .remote( # type: ignore - request_id=request_id, - inputs=inputs, - lora_request=lora_request) - else: - processed_inputs = await self.engine.process_model_inputs_async( - request_id=request_id, - inputs=inputs, - lora_request=lora_request) - stream = self._request_tracker.add_request( request_id, - inputs=processed_inputs, + inputs=inputs, params=params, arrival_time=arrival_time, lora_request=lora_request, -- GitLab From 
d1c3d7d1398c26fa5afd4583a58fceca76555c2a Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 14 Jun 2024 10:59:28 -0700 Subject: [PATCH 045/376] [misc][distributed] fix benign error in `is_in_the_same_node` (#5512) --- vllm/distributed/parallel_state.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index f6a2fc9b0..16c5297af 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -23,8 +23,9 @@ import contextlib from collections import namedtuple from contextlib import contextmanager, nullcontext from dataclasses import dataclass -from multiprocessing import resource_tracker, shared_memory +from multiprocessing import shared_memory from typing import Any, Dict, List, Optional, Tuple, Union +from unittest.mock import patch import torch from torch.distributed import Backend, ProcessGroup @@ -744,7 +745,12 @@ def is_in_the_same_node(pg: ProcessGroup): src=ranks[0], group=pg) name = recv[0] - shm = shared_memory.SharedMemory(name=name) + # fix to https://stackoverflow.com/q/62748654/9191338 + # Python incorrectly tracks shared memory even if it is not + # created by the process. The following patch is a workaround. 
+ with patch("multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None): + shm = shared_memory.SharedMemory(name=name) if shm.buf[:len(magic_message)] == magic_message: is_in_the_same_node[rank] = 1 except Exception as e: @@ -757,14 +763,8 @@ def is_in_the_same_node(pg: ProcessGroup): # clean up the shared memory segment with contextlib.suppress(OSError): - if rank == 0: - if shm: - shm.unlink() - else: - if shm: - # fix to https://stackoverflow.com/q/62748654/9191338 - resource_tracker.unregister( - shm._name, "shared_memory") # type: ignore[attr-defined] + if rank == 0 and shm: + shm.unlink() torch.distributed.all_reduce(is_in_the_same_node, group=pg) return is_in_the_same_node.sum().item() == world_size -- GitLab From cdab68dcdb7a68b46b8138f73cdd6ac26ff6d9c0 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 14 Jun 2024 13:17:21 -0500 Subject: [PATCH 046/376] [Docs] Add ZhenFund as a Sponsor (#5548) --- README.md | 1 + docs/source/community/sponsors.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index ce2d8d5fd..c24768bf7 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,7 @@ vLLM is a community project. Our compute resources for development and testing a - Trainy - UC Berkeley - UC San Diego +- ZhenFund We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index c8f2c16d3..cd8e8b0f5 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -22,5 +22,6 @@ vLLM is a community project. Our compute resources for development and testing a - Trainy - UC Berkeley - UC San Diego +- ZhenFund We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). 
We plan to use the fund to support the development, maintenance, and adoption of vLLM. -- GitLab From 6e2527a7cb94fa9154e34a42b95c1e4eb9a83e01 Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Fri, 14 Jun 2024 14:27:57 -0400 Subject: [PATCH 047/376] [Doc] Update documentation on Tensorizer (#5471) --- docs/source/index.rst | 1 + docs/source/serving/tensorizer.rst | 12 ++++++++++++ vllm/engine/arg_utils.py | 2 +- 3 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 docs/source/serving/tensorizer.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index b7c0d5b88..f5d862759 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -81,6 +81,7 @@ Documentation serving/env_vars serving/usage_stats serving/integrations + serving/tensorizer .. toctree:: :maxdepth: 1 diff --git a/docs/source/serving/tensorizer.rst b/docs/source/serving/tensorizer.rst new file mode 100644 index 000000000..a44696507 --- /dev/null +++ b/docs/source/serving/tensorizer.rst @@ -0,0 +1,12 @@ +.. _tensorizer: + +Loading Models with CoreWeave's Tensorizer +========================================== +vLLM supports loading models with `CoreWeave's Tensorizer `_. +vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized +at runtime extremely quickly directly to the GPU, resulting in significantly +shorter Pod startup times and CPU memory usage. Tensor encryption is also supported. + +For more information on CoreWeave's Tensorizer, please refer to +`CoreWeave's Tensorizer documentation `_. For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see +the `vLLM example script `_. 
\ No newline at end of file diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 227de5475..ba53b5c86 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -230,7 +230,7 @@ class EngineArgs: '* "dummy" will initialize the weights with random values, ' 'which is mainly for profiling.\n' '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. See the Tensorize vLLM Model script in the Examples' + 'CoreWeave. See the Tensorize vLLM Model script in the Examples ' 'section for more information.\n' '* "bitsandbytes" will load the weights using bitsandbytes ' 'quantization.\n') -- GitLab From e2afb03c92a06700d296a2e7f6565d4a4f05168c Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Fri, 14 Jun 2024 22:28:11 +0200 Subject: [PATCH 048/376] [Bugfix] Enable loading FP8 checkpoints for gpt_bigcode models (#5460) Signed-off-by: Thomas Parnell --- vllm/model_executor/models/gpt_bigcode.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 69b75763e..b15ed1198 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -299,4 +299,10 @@ class GPTBigCodeForCausalLM(nn.Module): param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) + # TODO (@robertgshaw2-neuralmagic): move to fp8 linear method + if "c_attn.input_scale" in name or "c_attn.weight_scale" in name: + weight_loader(param, loaded_weight, 'q') + weight_loader(param, loaded_weight, 'k') + weight_loader(param, loaded_weight, 'v') + else: + weight_loader(param, loaded_weight) -- GitLab From 28c145eb5755902505c066dc3b1e5315572cc6e7 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 14 Jun 2024 14:40:09 -0700 Subject: [PATCH 049/376] [Bugfix] Fix typo in Pallas backend (#5558) --- vllm/attention/backends/pallas.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 75f246526..b203c5ec5 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -110,7 +110,7 @@ class PallasAttentionBackendImpl(AttentionImpl): raise NotImplementedError("TPU version must be 4 or higher.") self.megacore_mode = None - tpu_type = torch_xla.tpu.get_tp_groupu_env()["TYPE"].lower() + tpu_type = torch_xla.tpu.get_tpu_env()["TYPE"].lower() if not tpu_type.endswith("lite"): if self.num_kv_heads % 2 == 0: self.megacore_mode = "kv_head" -- GitLab From f5bb85b435e6fe3db57fae1e25e09914015ef957 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 14 Jun 2024 14:47:45 -0700 Subject: [PATCH 050/376] [Core][Distributed] improve p2p cache generation (#5528) --- .../device_communicators/cuda_wrapper.py | 146 ++++++++++++ .../custom_all_reduce_utils.py | 215 ++++++++++-------- 2 files changed, 265 insertions(+), 96 deletions(-) create mode 100644 vllm/distributed/device_communicators/cuda_wrapper.py diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py new file mode 100644 index 000000000..24308235c --- /dev/null +++ b/vllm/distributed/device_communicators/cuda_wrapper.py @@ -0,0 +1,146 @@ +"""This file is a pure Python wrapper for the cudart library. +It avoids the need to compile a separate shared library, and is +convenient for use when we just need to call a few functions. 
+""" + +import ctypes +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +# this line makes it possible to directly load `libcudart.so` using `ctypes` +import torch # noqa + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +# === export types and functions from cudart to Python === +# for the original cudart definition, please check +# https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +cudaError_t = ctypes.c_int +cudaMemcpyKind = ctypes.c_int + + +class cudaIpcMemHandle_t(ctypes.Structure): + _fields_ = [("internal", ctypes.c_byte * 128)] + + +@dataclass +class Function: + name: str + restype: Any + argtypes: List[Any] + + +class CudaRTLibrary: + exported_functions = [ + # ​cudaError_t cudaSetDevice ( int device ) + Function("cudaSetDevice", cudaError_t, [ctypes.c_int]), + # cudaError_t cudaDeviceSynchronize ( void ) + Function("cudaDeviceSynchronize", cudaError_t, []), + # ​cudaError_t cudaDeviceReset ( void ) + Function("cudaDeviceReset", cudaError_t, []), + + # const char* cudaGetErrorString ( cudaError_t error ) + Function("cudaGetErrorString", ctypes.c_char_p, [cudaError_t]), + + # ​cudaError_t cudaMalloc ( void** devPtr, size_t size ) + Function("cudaMalloc", cudaError_t, + [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]), + # ​cudaError_t cudaFree ( void* devPtr ) + Function("cudaFree", cudaError_t, [ctypes.c_void_p]), + # ​cudaError_t cudaMemset ( void* devPtr, int value, size_t count ) + Function("cudaMemset", cudaError_t, + [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]), + # ​cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) # noqa + Function("cudaMemcpy", cudaError_t, [ + ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind + ]), + + # cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) # noqa + Function("cudaIpcGetMemHandle", cudaError_t, + [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p]), + # 
​cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags ) # noqa + Function("cudaIpcOpenMemHandle", cudaError_t, [ + ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint + ]), + ] + + # class attribute to store the mapping from the path to the library + # to avoid loading the same library multiple times + path_to_library_cache: Dict[str, Any] = {} + + # class attribute to store the mapping from library path + # to the corresponding dictionary + path_to_dict_mapping: Dict[str, Dict[str, Any]] = {} + + def __init__(self, so_file: Optional[str] = None): + if so_file is None: + assert torch.version.cuda is not None + major_version = torch.version.cuda.split(".")[0] + so_file = f"libcudart.so.{major_version}" + if so_file not in CudaRTLibrary.path_to_library_cache: + lib = ctypes.CDLL(so_file) + CudaRTLibrary.path_to_library_cache[so_file] = lib + self.lib = CudaRTLibrary.path_to_library_cache[so_file] + + if so_file not in CudaRTLibrary.path_to_dict_mapping: + _funcs = {} + for func in CudaRTLibrary.exported_functions: + f = getattr(self.lib, func.name) + f.restype = func.restype + f.argtypes = func.argtypes + _funcs[func.name] = f + CudaRTLibrary.path_to_dict_mapping[so_file] = _funcs + self.funcs = CudaRTLibrary.path_to_dict_mapping[so_file] + + def CUDART_CHECK(self, result: cudaError_t) -> None: + if result != 0: + error_str = self.cudaGetErrorString(result) + raise RuntimeError(f"CUDART error: {error_str}") + + def cudaGetErrorString(self, error: cudaError_t) -> str: + return self.funcs["cudaGetErrorString"](error).decode("utf-8") + + def cudaSetDevice(self, device: int) -> None: + self.CUDART_CHECK(self.funcs["cudaSetDevice"](device)) + + def cudaDeviceSynchronize(self) -> None: + self.CUDART_CHECK(self.funcs["cudaDeviceSynchronize"]()) + + def cudaDeviceReset(self) -> None: + self.CUDART_CHECK(self.funcs["cudaDeviceReset"]()) + + def cudaMalloc(self, size: int) -> ctypes.c_void_p: + devPtr = 
ctypes.c_void_p() + self.CUDART_CHECK(self.funcs["cudaMalloc"](ctypes.byref(devPtr), size)) + return devPtr + + def cudaFree(self, devPtr: ctypes.c_void_p) -> None: + self.CUDART_CHECK(self.funcs["cudaFree"](devPtr)) + + def cudaMemset(self, devPtr: ctypes.c_void_p, value: int, + count: int) -> None: + self.CUDART_CHECK(self.funcs["cudaMemset"](devPtr, value, count)) + + def cudaMemcpy(self, dst: ctypes.c_void_p, src: ctypes.c_void_p, + count: int) -> None: + cudaMemcpyDefault = 4 + kind = cudaMemcpyDefault + self.CUDART_CHECK(self.funcs["cudaMemcpy"](dst, src, count, kind)) + + def cudaIpcGetMemHandle(self, + devPtr: ctypes.c_void_p) -> cudaIpcMemHandle_t: + handle = cudaIpcMemHandle_t() + self.CUDART_CHECK(self.funcs["cudaIpcGetMemHandle"]( + ctypes.byref(handle), devPtr)) + return handle + + def cudaIpcOpenMemHandle(self, + handle: cudaIpcMemHandle_t) -> ctypes.c_void_p: + cudaIpcMemLazyEnablePeerAccess = 1 + devPtr = ctypes.c_void_p() + self.CUDART_CHECK(self.funcs["cudaIpcOpenMemHandle"]( + ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess)) + return devPtr diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index c9573edb0..e6957b119 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -1,87 +1,98 @@ +import ctypes import json import os -import sys -import tempfile -import time -from contextlib import contextmanager -from typing import Callable, Dict, List, Optional +from itertools import product +from typing import Dict, Optional, Sequence -import torch import torch.distributed as dist import torch.multiprocessing as mp import vllm.envs as envs +from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary from vllm.logger import init_logger from vllm.utils import cuda_device_count_stateless logger = init_logger(__name__) -@contextmanager -def 
mute_output(): - with open(os.devnull, "w") as f: - sys.stderr = f - sys.stdout = f - yield - - -def producer(i: int, - init_method: str, +def producer(batch_src: Sequence[int], + producer_queue, + consumer_queue, + result_queue, cuda_visible_devices: Optional[str] = None): if cuda_visible_devices is not None: os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices - with mute_output(): - dist.init_process_group( - backend="gloo", - init_method=init_method, - world_size=2, - rank=0, - ) - # produce a tensor in GPU i - data = torch.zeros((128, ), device=f"cuda:{i}") - # get the information to reconstruct the shared tensor - func, args = torch.multiprocessing.reductions.reduce_tensor(data) - args = list(args) - dist.broadcast_object_list([(func, args)], src=0) - dist.barrier() - torch.cuda.synchronize() - assert torch.all(data == 1).item() - - -def consumer(j: int, - init_method: str, + + lib = CudaRTLibrary() + for i in batch_src: + lib.cudaSetDevice(i) + pointer = lib.cudaMalloc(1024) + lib.cudaMemset(pointer, 1, 1024) + lib.cudaDeviceSynchronize() + handle = lib.cudaIpcGetMemHandle(pointer) + producer_queue.put(handle) + open_success = consumer_queue.get() + if open_success: + # use two queues to simulate barrier + producer_queue.put(0) + consumer_queue.get() + # check if the memory is modified + host_data = (ctypes.c_char * 1024)() + lib.cudaMemcpy(host_data, pointer, 1024) # type: ignore + for i in range(1024): + if ord(host_data[i]) != 2: + open_success = False + break + result_queue.put(open_success) + lib.cudaDeviceReset() + + +def consumer(batch_tgt: Sequence[int], + producer_queue, + consumer_queue, + result_queue, cuda_visible_devices: Optional[str] = None): if cuda_visible_devices is not None: os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices - with mute_output(): - dist.init_process_group( - backend="gloo", - init_method=init_method, - world_size=2, - rank=1, - ) - torch.cuda.set_device(j) - recv = [None] - dist.broadcast_object_list(recv, 
src=0) - func: Callable - args: List - func, args = recv[0] # type: ignore - # `args[6]` is the device id - # by default pytorch will use `i` from the producer - # here we need to set it to `j` to test P2P access - args[6] = j - data = func(*args) - data += 1 - dist.barrier() - torch.cuda.synchronize() - assert torch.all(data == 1).item() - - -def can_actually_p2p(i, j): + + lib = CudaRTLibrary() + for j in batch_tgt: + lib.cudaSetDevice(j) + handle = producer_queue.get() + open_success = False + try: + pointer = lib.cudaIpcOpenMemHandle(handle) # type: ignore + open_success = True + except RuntimeError: + # cannot error out here, because the producer process + # is still waiting for the response. + pass + consumer_queue.put(open_success) + if open_success: + # modify the memory + lib.cudaMemset(pointer, 2, 1024) + # use two queues to simulate barrier + producer_queue.get() + consumer_queue.put(0) + # check if the memory is modified + host_data = (ctypes.c_char * 1024)() + lib.cudaMemcpy(host_data, pointer, 1024) # type: ignore + for i in range(1024): + if ord(host_data[i]) != 2: + open_success = False + break + result_queue.put(open_success) + lib.cudaDeviceReset() + + +def can_actually_p2p( + batch_src: Sequence[int], + batch_tgt: Sequence[int], +): """ Usually, checking if P2P access is enabled can be done by - `torch.cuda.can_device_access_peer(i, j)`. However, sometimes - the driver might be broken, and `torch.cuda.can_device_access_peer(i, j)` + `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes + the driver might be broken, and `torch.cuda.can_device_access_peer(src, tgt)` returns `True` even if P2P access is not actually possible. 
See https://github.com/vllm-project/vllm/issues/2728 and https://forums.developer.nvidia.com/t/direct-gpu-gpu-communication-does-not-seem-to-work-properly/283264/10 @@ -90,41 +101,50 @@ def can_actually_p2p(i, j): Note on p2p and cuda IPC: Usually, one process uses one GPU: - GPU i --> cuda context i --> tensor i --> process i + GPU src --> cuda context src --> tensor src --> process src We need to combine p2p and cuda IPC, so that: - GPU i --> cuda context i --> tensor i --> process i - |shared| - GPU j --> cuda context j --> tensor j --> process j - That is to say, process i creates a tensor in GPU i, passes IPC handle to - process j, and process j accesses the tensor in GPU j. Any operation on the - tensor in process j will be reflected in the tensor in process i, because + GPU src --> cuda context src --> tensor src --> process src + |shared| + GPU tgt --> cuda context tgt --> tensor tgt --> process tgt + That is to say, process src creates a tensor in GPU src, passes IPC handle to + process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the + tensor in process tgt will be reflected in the tensor in process src, because they are the same memory segment. - It is important to note that process j accesses the tensor in GPU j, not - GPU i. That's why we need p2p access. # noqa - """ + It is important to note that process tgt accesses the tensor in GPU tgt, not + GPU src. That's why we need p2p access. + + The most time-consuming part is the process creation. To avoid creating + processes for every pair of GPUs, we use batched testing. We create two + processes for testing all pairs of GPUs in batch. The trick is to reset + the device after each test (which is not available in PyTorch). 
+ """ # noqa cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None) # pass the CUDA_VISIBLE_DEVICES to the child process # to make sure they see the same set of GPUs - # make sure the temp file is not the same across different calls - temp_path = tempfile.mktemp() + str(time.time()) - # create an empty file - with open(temp_path, "w"): - pass - init_method = f"file://{temp_path}" - # make sure the processes are spawned smp = mp.get_context("spawn") - pi = smp.Process(target=producer, - args=(i, init_method, cuda_visible_devices)) - pj = smp.Process(target=consumer, - args=(j, init_method, cuda_visible_devices)) - pi.start() - pj.start() - pi.join() - pj.join() - return pi.exitcode == 0 and pj.exitcode == 0 + producer_queue = smp.Queue() + consumer_queue = smp.Queue() + result_queue = smp.Queue() + p_src = smp.Process(target=producer, + args=(batch_src, producer_queue, consumer_queue, + result_queue, cuda_visible_devices)) + p_tgt = smp.Process(target=consumer, + args=(batch_tgt, producer_queue, consumer_queue, + result_queue, cuda_visible_devices)) + p_src.start() + p_tgt.start() + p_src.join() + p_tgt.join() + result = [] + for src, tgt in zip(batch_src, batch_tgt): + a = result_queue.get() + b = result_queue.get() + assert a == b + result.append(a) + return result # why do we need this cache? 
@@ -142,14 +162,14 @@ def can_actually_p2p(i, j): _gpu_p2p_access_cache: Optional[Dict[str, bool]] = None -def gpu_p2p_access_check(i: int, j: int) -> bool: - """Check if GPU i can access GPU j.""" +def gpu_p2p_access_check(src: int, tgt: int) -> bool: + """Check if GPU src can access GPU tgt.""" # if the cache variable is already calculated, # read from the cache instead of checking it again global _gpu_p2p_access_cache if _gpu_p2p_access_cache is not None: - return _gpu_p2p_access_cache[f"{i}->{j}"] + return _gpu_p2p_access_cache[f"{src}->{tgt}"] is_distributed = dist.is_initialized() @@ -169,9 +189,12 @@ def gpu_p2p_access_check(i: int, j: int) -> bool: # enter this block to calculate the cache logger.info("generating GPU P2P access cache in %s", path) cache = {} - for _i in range(num_dev): - for _j in range(num_dev): - cache[f"{_i}->{_j}"] = can_actually_p2p(_i, _j) + ids = list(range(num_dev)) + # batch of all pairs of GPUs + batch_src, batch_tgt = zip(*list(product(ids, ids))) + result = can_actually_p2p(batch_src, batch_tgt) + for _i, _j, r in zip(batch_src, batch_tgt, result): + cache[f"{_i}->{_j}"] = r with open(path, "w") as f: json.dump(cache, f, indent=4) if is_distributed: @@ -180,7 +203,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool: with open(path, "r") as f: cache = json.load(f) _gpu_p2p_access_cache = cache - return _gpu_p2p_access_cache[f"{i}->{j}"] + return _gpu_p2p_access_cache[f"{src}->{tgt}"] __all__ = ["gpu_p2p_access_check"] -- GitLab From bd7efe95d03773c65fa7dc1e122f3ce0e079a542 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 14 Jun 2024 19:18:22 -0500 Subject: [PATCH 051/376] Add ccache to amd (#5555) --- .buildkite/test-template-aws.j2 | 1 + Dockerfile.rocm | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index 09649b625..01f7ff1e0 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -30,6 +30,7 @@ steps: 
command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" env: DOCKER_BUILDKIT: "1" + priority: 100 soft_fail: true {% endif %} {% endfor %} diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 954958df8..724fa1673 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -42,6 +42,7 @@ RUN apt-get update && apt-get install -y \ unzip \ nvidia-cuda-toolkit \ tmux \ + ccache \ && rm -rf /var/lib/apt/lists/* ### Mount Point ### @@ -102,7 +103,9 @@ ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so -RUN --mount=type=cache,target=/root/.cache/pip \ +ENV CCACHE_DIR=/root/.cache/ccache +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/pip \ pip install -U -r requirements-rocm.txt \ && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \ && python3 setup.py install \ -- GitLab From 1b8a0d71cf5aa1a43c14478ec90538c3fbe1b315 Mon Sep 17 00:00:00 2001 From: leiwen83 Date: Sat, 15 Jun 2024 08:23:56 +0800 Subject: [PATCH 052/376] [Core][Bugfix]: fix prefix caching for blockv2 (#5364) Signed-off-by: Lei Wen Co-authored-by: Lei Wen --- tests/core/block/e2e/test_correctness.py | 67 ++++++++++++++++++++++++ vllm/core/block/prefix_caching_block.py | 7 ++- 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index ad253635e..8502eab0f 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -477,3 +477,70 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator, assert expected_token_ids == actual_token_ids assert baseline_token_ids == test_token_ids + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. 
+ "model": "facebook/opt-125m", + + # skip cuda graph creation for fast test. + "enforce_eager": True, + + # we keep the blocks small, so that hit eviction quickly + "max_model_len": 48, + "block_size": 16, + "num_gpu_blocks_override": 3, + + # Test APC in v2 block + "use_v2_block_manager": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{ + "enable_prefix_caching": False +}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "enable_prefix_caching": True, +}]) +@pytest.mark.parametrize("seed", [1]) +def test_auto_prefix_caching_after_evition_start(baseline_llm_generator, + test_llm_generator): + """Verify block manager v2 with auto prefix caching could works normal + even when eviction started. + With APC enabled, all blocks are held by native block at the beginning. + Then blocks are managed by evictor instead. If cache hit at the evitor's + block, then it could be reused, or we need to recompute its kv cache. + """ + output_len = 10 + temperature = 0.0 + + prompts = [ + "You are a helpful assistant. Please answer truthfully and write " + "out your thinking step by step to be sure you get the right answer. " + "If you make a mistake, attempt to correct it. who are you?", + "You are a helpful assistant. Please answer truthfully and write out " + "your thinking step by step to be sure you get the right answer. You " + "are helpful and harmless and you follow ethical guidelines. " + "who are you?" 
+ ] + + sampling_params = SamplingParams( + max_tokens=output_len, + ignore_eos=True, + temperature=temperature, + ) + + print('Getting token ids with APC disabled') + baseline_token_ids = get_token_ids_from_llm_generator( + baseline_llm_generator, prompts, sampling_params) + + print('Getting token ids with APC enabled') + test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, + prompts, sampling_params) + + for expected_token_ids, actual_token_ids in zip(baseline_token_ids, + test_token_ids): + assert expected_token_ids == actual_token_ids + + assert baseline_token_ids == test_token_ids diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 405e97056..88dbbfb2f 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -176,14 +176,17 @@ class PrefixCachingBlockAllocator(BlockAllocator): self._refcounter.incr(block_id) - # the block comes from evictor already contain computed result + # Now this block is pop from evictor and ready to write + # with new content which most probably different with + # original content. 
So need to tell worker to recompute + # its kvcache block = self._create_block( prev_block=prev_block, token_ids=[], block_size=self._block_size, allocator=self, block_id=block_id, - computed=True, + computed=False, ) assert block.content_hash is None -- GitLab From 0e9164b40abdb30f1929edb44b56894c9e26c31d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Jun 2024 12:45:31 +0800 Subject: [PATCH 053/376] [mypy] Enable type checking for test directory (#5017) --- .github/workflows/mypy.yaml | 2 +- benchmarks/benchmark_serving.py | 18 +++---- benchmarks/benchmark_throughput.py | 4 +- benchmarks/kernels/benchmark_aqlm.py | 10 ++-- benchmarks/kernels/benchmark_marlin.py | 8 +-- benchmarks/kernels/benchmark_moe.py | 26 +++++++--- .../kernels/benchmark_paged_attention.py | 11 ++-- benchmarks/kernels/benchmark_rope.py | 7 +-- examples/fp8/extract_scales.py | 12 ++--- examples/offline_inference_distributed.py | 8 +-- format.sh | 2 +- tests/core/block/test_block_table.py | 8 +-- tests/core/block/test_prefix_caching_block.py | 4 +- tests/core/test_chunked_prefill_scheduler.py | 10 ++-- tests/core/test_scheduler.py | 52 +++++++++---------- tests/core/utils.py | 12 +++-- tests/distributed/test_pynccl.py | 5 +- tests/distributed/test_utils.py | 5 +- tests/entrypoints/test_openai_server.py | 5 +- tests/kernels/test_attention.py | 33 ++++++------ tests/kernels/test_blocksparse_attention.py | 22 ++++---- tests/kernels/test_cache.py | 32 ++++++------ tests/kernels/test_cutlass.py | 4 +- tests/kernels/test_flash_attn.py | 4 +- tests/kernels/test_pos_encoding.py | 28 +++++----- tests/lora/conftest.py | 21 ++++++-- tests/lora/data/long_context_test_data.py | 24 ++++++++- tests/lora/test_baichuan.py | 6 ++- tests/lora/test_chatglm3.py | 6 ++- tests/lora/test_gemma.py | 6 ++- tests/lora/test_layer_variation.py | 6 +-- tests/lora/test_layers.py | 23 ++++---- tests/lora/test_llama.py | 6 ++- tests/lora/test_long_context.py | 15 +++--- tests/lora/test_lora_checkpoints.py | 4 +- 
tests/lora/test_lora_manager.py | 6 +-- tests/lora/test_mixtral.py | 6 ++- tests/lora/test_phi.py | 6 ++- tests/lora/test_quant_model.py | 7 ++- tests/lora/utils.py | 18 +++---- tests/models/test_fp8.py | 3 +- tests/prefix_caching/test_prefix_caching.py | 5 +- tests/quantization/test_configs.py | 3 +- tests/samplers/test_logprobs.py | 11 ++-- tests/samplers/test_rejection_sampler.py | 4 +- tests/samplers/test_sampler.py | 41 ++++++++------- tests/spec_decode/e2e/conftest.py | 13 ++--- tests/spec_decode/test_batch_expansion.py | 6 ++- tests/spec_decode/test_multi_step_worker.py | 19 ++++--- tests/spec_decode/test_spec_decode_worker.py | 17 ++++-- tests/spec_decode/utils.py | 14 +++-- tests/test_cache_block_hashing.py | 2 +- tests/test_logger.py | 1 + tests/tokenization/test_detokenize.py | 4 +- tests/utils.py | 2 +- tests/worker/test_model_runner.py | 23 ++++---- vllm/attention/backends/torch_sdpa.py | 4 +- vllm/attention/backends/xformers.py | 4 +- vllm/core/block/block_table.py | 2 +- vllm/core/block/naive_block.py | 2 +- vllm/core/block/prefix_caching_block.py | 2 +- vllm/core/block_manager_v2.py | 2 +- .../custom_all_reduce_utils.py | 8 +-- .../device_communicators/pynccl_wrapper.py | 2 +- vllm/engine/llm_engine.py | 4 +- vllm/engine/metrics.py | 4 +- vllm/engine/output_processor/single_step.py | 6 +-- vllm/entrypoints/openai/run_batch.py | 3 +- vllm/entrypoints/openai/serving_chat.py | 2 +- vllm/entrypoints/openai/serving_embedding.py | 2 +- vllm/lora/lora.py | 3 +- vllm/lora/worker_manager.py | 2 +- vllm/model_executor/layers/linear.py | 2 +- .../layers/quantization/gptq_marlin.py | 11 ++-- .../quantization/utils/marlin_24_perms.py | 18 ++++--- .../layers/quantization/utils/marlin_perms.py | 18 ++++--- vllm/model_executor/layers/sampler.py | 25 +++++---- vllm/model_executor/model_loader/loader.py | 7 +-- .../model_loader/weight_utils.py | 2 +- vllm/model_executor/models/__init__.py | 4 +- vllm/model_executor/models/arctic.py | 4 +- 
vllm/model_executor/models/commandr.py | 4 +- vllm/model_executor/models/gemma.py | 4 +- vllm/sequence.py | 2 +- vllm/spec_decode/multi_step_worker.py | 10 ++-- vllm/spec_decode/ngram_worker.py | 6 +-- vllm/spec_decode/spec_decode_worker.py | 8 +-- vllm/spec_decode/util.py | 4 +- vllm/transformers_utils/detokenizer.py | 2 +- vllm/utils.py | 38 ++++++++------ vllm/worker/model_runner.py | 4 +- vllm/worker/worker_base.py | 4 +- 92 files changed, 510 insertions(+), 379 deletions(-) diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 22e6c2ef0..62f0dbcd9 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -47,5 +47,5 @@ jobs: mypy vllm/model_executor --config-file pyproject.toml mypy vllm/lora --config-file pyproject.toml mypy vllm/logging --config-file pyproject.toml - mypy vllm/model_executor --config-file pyproject.toml + mypy tests --config-file pyproject.toml diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index df32b366c..c136ee572 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -31,7 +31,7 @@ import time import warnings from dataclasses import dataclass from datetime import datetime -from typing import AsyncGenerator, List, Optional, Tuple +from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple import numpy as np from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, @@ -200,12 +200,12 @@ def calculate_metrics( dur_s: float, tokenizer: PreTrainedTokenizerBase, ) -> Tuple[BenchmarkMetrics, List[int]]: - actual_output_lens = [] + actual_output_lens: List[int] = [] total_input = 0 completed = 0 - itls = [] - tpots = [] - ttfts = [] + itls: List[float] = [] + tpots: List[float] = [] + ttfts: List[float] = [] for i in range(len(outputs)): if outputs[i].success: # We use the tokenizer to count the number of output tokens for all @@ -265,7 +265,7 @@ async def benchmark( disable_tqdm: bool, ): if backend in 
ASYNC_REQUEST_FUNCS: - request_func = ASYNC_REQUEST_FUNCS.get(backend) + request_func = ASYNC_REQUEST_FUNCS[backend] else: raise ValueError(f"Unknown backend: {backend}") @@ -292,7 +292,7 @@ async def benchmark( pbar = None if disable_tqdm else tqdm(total=len(input_requests)) benchmark_start_time = time.perf_counter() - tasks = [] + tasks: List[asyncio.Task] = [] async for request in get_request(input_requests, request_rate): prompt, prompt_len, output_len = request request_func_input = RequestFuncInput( @@ -310,7 +310,7 @@ async def benchmark( pbar=pbar))) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) - if not disable_tqdm: + if pbar is not None: pbar.close() benchmark_duration = time.perf_counter() - benchmark_start_time @@ -466,7 +466,7 @@ def main(args: argparse.Namespace): # Save config and results to json if args.save_result: - result_json = {} + result_json: Dict[str, Any] = {} # Setup current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 463d9973d..48dfce428 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -108,8 +108,8 @@ def run_vllm( ) # Add the requests to the engine. - prompts = [] - sampling_params = [] + prompts: List[str] = [] + sampling_params: List[SamplingParams] = [] for prompt, _, output_len in requests: prompts.append(prompt) sampling_params.append( diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 59392947b..ac6a9f297 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -86,9 +86,9 @@ def dequant_no_scale( # Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against # the generic pytorch version. # Just visual comparison. 
-def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: +def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: - n = parts.sum().item() + n = int(parts.sum().item()) device = torch.device('cuda:0') @@ -204,7 +204,7 @@ def main(): sys.stdout = sys.__stdout__ -def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, +def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods): # I didn't see visible improvements from increasing these, but feel free :) @@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, print('') -def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor, +def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method) -> float: - n = parts.sum().item() + n = int(parts.sum().item()) device = torch.device('cuda:0') diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index b77191178..96f01967b 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,4 +1,5 @@ import argparse +from typing import List import torch import torch.utils.benchmark as benchmark @@ -23,8 +24,9 @@ ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] -def bench_run(results, model, act_order, is_k_full, num_bits, group_size, - size_m, size_k, size_n): +def bench_run(results: List[benchmark.Measurement], model: str, + act_order: bool, is_k_full: bool, num_bits: int, group_size: int, + size_m: int, size_k: int, size_n: int): label = "Quant Matmul" sub_label = ("{}, act={} k_full={}, b={}, g={}, " @@ -156,7 +158,7 @@ def main(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - results = [] + results: List[benchmark.Measurement] = [] for model in args.models: for layer in WEIGHT_SHAPES[model]: diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 
be5dd32bd..62347aaf8 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,7 +1,7 @@ import argparse import time from datetime import datetime -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Tuple, TypedDict import ray import torch @@ -12,8 +12,17 @@ from transformers import AutoConfig from vllm.model_executor.layers.fused_moe.fused_moe import * +class BenchmarkConfig(TypedDict): + BLOCK_SIZE_M: int + BLOCK_SIZE_N: int + BLOCK_SIZE_K: int + GROUP_SIZE_M: int + num_warps: int + num_stages: int + + def benchmark_config( - config: Dict[str, int], + config: BenchmarkConfig, num_tokens: int, num_experts: int, shard_intermediate_size: int, @@ -92,7 +101,7 @@ def benchmark_config( start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) - latencies = [] + latencies: List[float] = [] for i in range(num_iters): prepare(i) torch.cuda.synchronize() @@ -111,7 +120,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]: # Reduced search space for faster tuning. # TODO(woosuk): Increase the search space and use a performance model to # prune the search space. 
- configs = [] + configs: List[BenchmarkConfig] = [] for num_stages in [2, 3, 4, 5]: for block_m in [16, 32, 64, 128, 256]: for block_k in [64, 128, 256]: @@ -175,8 +184,8 @@ class BenchmarkWorker: topk: int, dtype: torch.dtype, use_fp8: bool, - search_space: List[Dict[str, int]], - ) -> Dict[str, int]: + search_space: List[BenchmarkConfig], + ) -> BenchmarkConfig: best_config = None best_time = float("inf") for config in tqdm(search_space): @@ -199,10 +208,11 @@ class BenchmarkWorker: best_config = config now = datetime.now() print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}") + assert best_config is not None return best_config -def sort_config(config: Dict[str, int]) -> Dict[str, int]: +def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: return { "BLOCK_SIZE_M": config["BLOCK_SIZE_M"], "BLOCK_SIZE_N": config["BLOCK_SIZE_N"], @@ -214,7 +224,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]: def save_configs( - configs: Dict[int, Dict[str, int]], + configs: Dict[int, BenchmarkConfig], num_experts: int, shard_intermediate_size: int, hidden_size: int, diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index a5355f4c1..687e2369b 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,7 +1,7 @@ import argparse import random import time -from typing import Optional +from typing import List, Optional import torch @@ -54,14 +54,17 @@ def main( # Create the block tables. 
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables = [] + block_tables_lst: List[List[int]] = [] for _ in range(num_seqs): block_table = [ random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq) ] - block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int, device=device) + block_tables_lst.append(block_table) + + block_tables = torch.tensor(block_tables_lst, + dtype=torch.int, + device=device) # Create the KV cache. key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS, diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 00e55f606..a53c6c77a 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,11 +1,12 @@ import argparse from itertools import accumulate -from typing import Optional +from typing import List, Optional import nvtx import torch -from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, + get_rope) def benchmark_rope_kernels_multi_lora( @@ -37,7 +38,7 @@ def benchmark_rope_kernels_multi_lora( }) # non-batched RoPE takes only one scaling factor, we create multiple # instances to simulate the same behavior - non_batched_ropes = [] + non_batched_ropes: List[RotaryEmbedding] = [] for scaling_factor in scaling_factors: non_batched_ropes.append( get_rope(head_size, rotary_dim, max_position, base, is_neox_style, diff --git a/examples/fp8/extract_scales.py b/examples/fp8/extract_scales.py index e007a3bc0..1dce9d7e9 100644 --- a/examples/fp8/extract_scales.py +++ b/examples/fp8/extract_scales.py @@ -2,7 +2,7 @@ import argparse import glob import json import os -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple import numpy as np import torch @@ -19,7 +19,7 @@ def _prepare_hf_weights( quantized_model_dir: str, 
load_format: str = "auto", fall_back_to_pt: bool = True, -) -> Tuple[str, List[str], bool]: +) -> Tuple[List[str], bool]: if not os.path.isdir(quantized_model_dir): raise FileNotFoundError( f"The quantized model directory `{quantized_model_dir}` " @@ -94,7 +94,7 @@ def _hf_tensorfile_iterator(filename: str, load_format: str, def _kv_scales_extractor( - hf_tensor_files: Iterable[str], + hf_tensor_files: List[str], use_safetensors: bool, rank_keyword: str = "rank", expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]: @@ -115,7 +115,7 @@ def _kv_scales_extractor( for char in rank_keyword: assert not char.isdecimal( ), f"Rank keyword {rank_keyword} contains a numeric character!" - rank_scales_map = {} + rank_scales_map: Dict[int, Dict[int, float]] = {} for tensor_file in hf_tensor_files: try: rank_idx = tensor_file.find(rank_keyword) @@ -141,7 +141,7 @@ def _kv_scales_extractor( raise if rank not in rank_scales_map: - layer_scales_map = {} + layer_scales_map: Dict[int, float] = {} rank_scales_map[rank] = layer_scales_map else: raise RuntimeError( @@ -222,7 +222,7 @@ def _metadata_extractor(quantized_model_dir: str, "does not exist.") metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json")) - result = {} + result: Dict[str, Any] = {} for file in metadata_files: with open(file) as f: try: diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference_distributed.py index 1e59e8950..677127844 100644 --- a/examples/offline_inference_distributed.py +++ b/examples/offline_inference_distributed.py @@ -5,7 +5,7 @@ distributively on a multi-nodes cluster. Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html """ -from typing import Dict +from typing import Any, Dict, List import numpy as np import ray @@ -40,8 +40,8 @@ class LLMPredictor: # The output is a list of RequestOutput objects that contain the prompt, # generated text, and other information. 
outputs = self.llm.generate(batch["text"], sampling_params) - prompt = [] - generated_text = [] + prompt: List[str] = [] + generated_text: List[str] = [] for output in outputs: prompt.append(output.prompt) generated_text.append(' '.join([o.text for o in output.outputs])) @@ -71,7 +71,7 @@ def scheduling_strategy_fn(): pg, placement_group_capture_child_tasks=True)) -resources_kwarg = {} +resources_kwarg: Dict[str, Any] = {} if tensor_parallel_size == 1: # For tensor_parallel_size == 1, we simply set num_gpus=1. resources_kwarg["num_gpus"] = 1 diff --git a/format.sh b/format.sh index 2fd6af03b..8c54b5630 100755 --- a/format.sh +++ b/format.sh @@ -111,7 +111,7 @@ mypy vllm/spec_decode --config-file pyproject.toml mypy vllm/model_executor --config-file pyproject.toml mypy vllm/lora --config-file pyproject.toml mypy vllm/logging --config-file pyproject.toml -mypy vllm/model_executor --config-file pyproject.toml +mypy tests --config-file pyproject.toml # If git diff returns a file that is in the skip list, the file may be checked anyway: diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py index 6fb95cfdf..496774c8d 100644 --- a/tests/core/block/test_block_table.py +++ b/tests/core/block/test_block_table.py @@ -1,3 +1,5 @@ +from typing import List + import pytest from vllm.core.block.block_table import BlockTable @@ -28,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int): token_ids = list(range(sequence_len)) num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size))) - block_tables = [] + block_tables: List[BlockTable] = [] for i in range(5): assert allocator.get_num_free_blocks( device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc @@ -73,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int): num_immutable_blocks_per_alloc = len( chunked_tokens) - num_mutable_blocks_per_alloc - block_tables = [] + block_tables: List[BlockTable] = [] for alloc_i in range(1, 6): 
block_tables.append( @@ -268,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int, ) block_table.allocate(token_ids=token_ids, device=Device.GPU) - appended_so_far = [] + appended_so_far: List[int] = [] for append in chunk_list(token_ids_to_append, append_size): block_table.append_token_ids(append) appended_so_far.extend(append) diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index bcf08cda0..fcf32cbe9 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -123,7 +123,7 @@ class TestPrefixCachingBlock: num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]: """Helper method which creates a chain of blocks. """ - blocks = [] + blocks: List[PrefixCachingBlock] = [] num_blocks = math.ceil( len(token_ids) / block_size) + num_empty_trailing_blocks @@ -608,7 +608,7 @@ class TestPrefixCachingBlockAllocator: ) -> List[PrefixCachingBlock]: """Helper method which creates a chain of blocks. """ - blocks = [] + blocks: List[Block] = [] num_blocks = math.ceil(len(token_ids) / block_size) if num_blocks == 0: diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index f68482cc0..a3b76327e 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -483,11 +483,11 @@ def test_chunked_prefill_preempt(): # The request should be preempted. scheduler.block_manager.can_append_slots = MagicMock() - def cannot_append_second_group(seq_group, num_lookahead_slots): + def cannot_append_second_group1(seq_group, num_lookahead_slots): return seq_group.request_id != "1" scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) + cannot_append_second_group1) # The running prefill is now preempted. 
_, out = schedule_and_update_computed_tokens(scheduler) @@ -505,11 +505,11 @@ def test_chunked_prefill_preempt(): assert seq_group.get_num_uncomputed_tokens() == 30 # We should be able to run prefill twice as it is chunked. - def cannot_append_second_group(seq_group, num_lookahead_slots): + def cannot_append_second_group2(seq_group, num_lookahead_slots): return True scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) + cannot_append_second_group2) _, out = schedule_and_update_computed_tokens(scheduler) assert len(out.scheduled_seq_groups) == 1 assert out.num_prefill_groups == 1 @@ -530,7 +530,7 @@ def test_chunked_prefill_max_seqs(): cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 scheduler = Scheduler(scheduler_config, cache_config, None) - running = [] + running: List[SequenceGroup] = [] _, seq_group = create_dummy_prompt("1", prompt_length=65) scheduler.add_seq_group(seq_group) diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 07fc8731e..bae958211 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -1,6 +1,6 @@ import time from collections import deque -from typing import List +from typing import Deque, List, Set, Tuple from unittest.mock import MagicMock import pytest # noqa @@ -65,7 +65,7 @@ def test_scheduler_abort_seq_group(): # Add multiple seq groups to scheduler. num_seq_group = 4 - request_ids = set() + request_ids: Set[str] = set() for i in range(num_seq_group): _, seq_group = create_dummy_prompt(str(i), block_size) scheduler.add_seq_group(seq_group) @@ -347,7 +347,7 @@ def test_prefill_schedule_max_prompt_len(): Test prompt longer than max_prompt_len is aborted. 
""" scheduler = initialize_scheduler(max_model_len=30) - _, seq_group = create_dummy_prompt(0, prompt_length=60) + _, seq_group = create_dummy_prompt("0", prompt_length=60) waiting = deque([seq_group]) budget = create_token_budget() remaining_waiting, output = scheduler._schedule_prefills( @@ -364,7 +364,7 @@ def test_prefill_schedule_token_budget(): Test token budget respected. """ scheduler = initialize_scheduler() - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget(token_budget=0) for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) @@ -419,7 +419,7 @@ def test_prefill_schedule_max_seqs(): Test max seq respected. """ scheduler = initialize_scheduler() - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget(max_num_seqs=2) for i in range(3): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) @@ -453,9 +453,9 @@ def test_prefill_schedule_max_lora(): """ lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) scheduler = initialize_scheduler(lora_config=lora_config) - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget(token_budget=120) - curr_loras = set() + curr_loras: Set[int] = set() for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -499,7 +499,7 @@ def test_prefill_schedule_no_block_manager_capacity(): Test sequence cannot be scheduled due to block manager has no capacity. """ scheduler = initialize_scheduler() - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget() for i in range(3): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) @@ -536,7 +536,7 @@ def test_decode_schedule_preempted(): Test decodes cannot be scheduled and preempted. 
""" scheduler = initialize_scheduler() - running = deque() + running: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None for i in range(3): @@ -577,7 +577,7 @@ def test_decode_swap_beam_search(): Test best_of > 1 swap out blocks """ scheduler = initialize_scheduler() - running = deque() + running: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None budget = create_token_budget() @@ -628,7 +628,7 @@ def test_schedule_decode_blocks_to_copy_update(): """ scheduler = initialize_scheduler() _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) - running = deque() + running: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None scheduler._allocate_and_set_running(seq_group) @@ -656,10 +656,10 @@ def test_schedule_decode_blocks_to_copy_update(): def test_schedule_swapped_simple(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) @@ -683,10 +683,10 @@ def test_schedule_swapped_simple(): def test_schedule_swapped_max_token_budget(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for _ in range(2): _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) @@ -717,10 +717,10 @@ def test_schedule_swapped_max_token_budget(): def test_schedule_swapped_max_seqs(): scheduler = initialize_scheduler() - swapped = 
deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for i in range(4): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) scheduler._allocate_and_set_running(seq_group) @@ -750,10 +750,10 @@ def test_schedule_swapped_max_seqs(): def test_schedule_swapped_max_loras(): lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) scheduler = initialize_scheduler(lora_config=lora_config) - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") - curr_loras = set() - blocks_to_swap_out = [] + curr_loras: Set[int] = set() + blocks_to_swap_out: List[Tuple[int, int]] = [] for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -779,10 +779,10 @@ def test_schedule_swapped_max_loras(): def test_schedule_swapped_cannot_swap_in(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for _ in range(2): _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) @@ -806,10 +806,10 @@ def test_schedule_swapped_cannot_swap_in(): def test_infeasible_swap(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for _ in range(2): _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) @@ -834,13 +834,13 @@ def test_infeasible_swap(): def test_schedule_swapped_blocks_to_copy(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = 
deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] scheduler._swap_out(seq_group, blocks_to_swap_out) swapped.append(seq_group) diff --git a/tests/core/utils.py b/tests/core/utils.py index 2fbf099c5..f249f4b59 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -1,5 +1,7 @@ import time -from typing import Iterable, Optional, Tuple +from typing import List, Optional +from typing import Sequence as GenericSequence +from typing import Tuple from vllm import SamplingParams from vllm.lora.request import LoRARequest @@ -46,7 +48,7 @@ def create_dummy_prompt_encoder_decoder( lora_request: Optional[LoRARequest] = None, use_beam_search: bool = False, best_of: int = 1, -) -> Tuple[Sequence, SequenceGroup]: +) -> Tuple[Sequence, Sequence, SequenceGroup]: if not block_size: block_size = decoder_prompt_length @@ -86,7 +88,7 @@ def create_dummy_prompt_encoder_decoder( def create_seq_group( seq_prompt_len: int = 1024, - seq_output_lens: Iterable[int] = (128, ), + seq_output_lens: GenericSequence[int] = (128, ), request_id: str = '0', seq_id_start: int = 0, sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: @@ -98,7 +100,7 @@ def create_seq_group( prompt_token_ids = [0] * seq_prompt_len - seqs = [] + seqs: List[Sequence] = [] for seq_id_offset, output_len in enumerate(seq_output_lens): seq = Sequence( seq_id=seq_id_start + seq_id_offset, @@ -125,7 +127,7 @@ def create_seq_group( def create_seq_group_encoder_decoder( seq_prompt_len: int = 1024, - seq_output_lens: Iterable[int] = (128, ), + seq_output_lens: GenericSequence[int] = (128, ), request_id: str = '0', seq_id_start: int = 0, sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: diff --git a/tests/distributed/test_pynccl.py 
b/tests/distributed/test_pynccl.py index b788e253a..964dbc542 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -1,5 +1,6 @@ import multiprocessing import os +from typing import Dict, List import pytest import torch @@ -17,9 +18,9 @@ from vllm.utils import update_environment_variables def distributed_run(fn, world_size): number_of_processes = world_size - processes = [] + processes: List[multiprocessing.Process] = [] for i in range(number_of_processes): - env = {} + env: Dict[str, str] = {} env['RANK'] = str(i) env['LOCAL_RANK'] = str(i) env['WORLD_SIZE'] = str(number_of_processes) diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 923ad66c2..49d11daca 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -6,7 +6,7 @@ from vllm.utils import cuda_device_count_stateless @ray.remote -class _CUDADeviceCountStatelessTestActor(): +class _CUDADeviceCountStatelessTestActor: def get_count(self): return cuda_device_count_stateless() @@ -22,7 +22,8 @@ def test_cuda_device_count_stateless(): """Test that cuda_device_count_stateless changes return value if CUDA_VISIBLE_DEVICES is changed.""" - actor = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote() + actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore + num_gpus=2).remote() assert sorted(ray.get( actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"] assert ray.get(actor.get_count.remote()) == 2 diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 2d7e3044d..d66b9b0fd 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1,6 +1,7 @@ # imports for guided decoding tests import json import re +from typing import List import jsonschema import openai # use the official client for correctness check @@ -453,7 +454,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, max_tokens=5, 
temperature=0.0, stream=True) - chunks = [] + chunks: List[str] = [] finish_reason_count = 0 async for chunk in stream: chunks.append(chunk.choices[0].text) @@ -499,7 +500,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): temperature=0.0, stream=True, ) - chunks = [] + chunks: List[str] = [] finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 8bc4766fc..f848ad51c 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -72,27 +72,27 @@ def ref_single_query_cached_kv_attention( block_size = value_cache.shape[3] num_seqs = query.shape[0] - block_tables = block_tables.cpu().tolist() - seq_lens = seq_lens.cpu().tolist() + block_tables_lst = block_tables.cpu().tolist() + seq_lens_lst = seq_lens.cpu().tolist() for i in range(num_seqs): q = query[i].unsqueeze(0) - block_table = block_tables[i] - seq_len = int(seq_lens[i]) + block_table = block_tables_lst[i] + seq_len = int(seq_lens_lst[i]) - keys = [] - values = [] + keys_lst: List[torch.Tensor] = [] + values_lst: List[torch.Tensor] = [] for j in range(seq_len): block_number = int(block_table[j // block_size]) block_offset = j % block_size k = key_cache[block_number, :, :, block_offset, :] k = k.reshape(num_kv_heads, head_size) - keys.append(k) + keys_lst.append(k) v = value_cache[block_number, :, :, block_offset] - values.append(v) - keys = torch.stack(keys, dim=0) - values = torch.stack(values, dim=0) + values_lst.append(v) + keys = torch.stack(keys_lst, dim=0) + values = torch.stack(values_lst, dim=0) if num_queries_per_kv > 1: # Handle MQA and GQA keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1) @@ -157,14 +157,15 @@ def test_paged_attention( # Create the block tables. 
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables = [] + block_tables_lst: List[List[int]] = [] for _ in range(num_seqs): block_table = [ random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq) ] - block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int) + block_tables_lst.append(block_table) + + block_tables = torch.tensor(block_tables_lst, dtype=torch.int) # Create the KV caches. key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, @@ -283,7 +284,7 @@ def ref_multi_query_kv_attention( dtype: torch.dtype, ) -> torch.Tensor: num_seqs = len(cu_seq_lens) - 1 - ref_outputs = [] + ref_outputs: List[torch.Tensor] = [] for i in range(num_seqs): start_idx = cu_seq_lens[i] end_idx = cu_seq_lens[i + 1] @@ -303,8 +304,8 @@ def ref_multi_query_kv_attention( attn_mask=attn_mask, ) ref_outputs.append(ref_output) - ref_output = torch.cat(ref_outputs, dim=0) - return ref_output + + return torch.cat(ref_outputs, dim=0) # TODO(woosuk): Add tests for USE_ALIBI=True. 
diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index 9da13ca6e..402545d19 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -77,27 +77,27 @@ def ref_single_query_cached_kv_attention( block_size = value_cache.shape[3] num_seqs = query.shape[0] - block_tables = block_tables.cpu().tolist() - seq_lens = seq_lens.cpu().tolist() + block_tables_lst = block_tables.cpu().tolist() + seq_lens_lst = seq_lens.cpu().tolist() for i in range(num_seqs): q = query[i].unsqueeze(0) - block_table = block_tables[i] - seq_len = int(seq_lens[i]) + block_table = block_tables_lst[i] + seq_len = int(seq_lens_lst[i]) - keys = [] - values = [] + keys_lst: List[torch.Tensor] = [] + values_lst: List[torch.Tensor] = [] for j in range(seq_len): block_number = int(block_table[j // block_size]) block_offset = j % block_size k = key_cache[block_number, :, :, block_offset, :] k = k.reshape(num_kv_heads, head_size) - keys.append(k) + keys_lst.append(k) v = value_cache[block_number, :, :, block_offset] - values.append(v) - keys = torch.stack(keys, dim=0) - values = torch.stack(values, dim=0) + values_lst.append(v) + keys = torch.stack(keys_lst, dim=0) + values = torch.stack(values_lst, dim=0) if num_queries_per_kv > 1: # Handle MQA and GQA keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1) @@ -432,7 +432,7 @@ def test_varlen_blocksparse_attention_prefill( value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) ref_output = ref_multi_query_kv_attention( - cu_seq_lens, + cu_seq_lens.tolist(), query, key, value, diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 29572cfa5..23b6baa60 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -1,5 +1,5 @@ import random -from typing import Tuple +from typing import List, Tuple import pytest import torch @@ -63,7 +63,7 @@ def test_copy_blocks( src_blocks = 
random.sample(range(num_blocks), num_mappings) remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) - block_mapping = [] + block_mapping: List[Tuple[int, int]] = [] for i in range(num_mappings): src = src_blocks[i] dst1 = dst_blocks[2 * i] @@ -131,8 +131,8 @@ def test_reshape_and_cache( torch.set_default_device(device) # Create a random slot mapping. num_slots = block_size * num_blocks - slot_mapping = random.sample(range(num_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping, dtype=torch.long) + slot_mapping_lst = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long) qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype) _, key, value = qkv.unbind(dim=1) @@ -170,12 +170,12 @@ def test_reshape_and_cache( # Run the reference implementation. reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_indicies = block_indicies.cpu().tolist() + block_indicies_lst = block_indicies.cpu().tolist() block_offsets = slot_mapping % block_size - block_offsets = block_offsets.cpu().tolist() + block_offsets_lst = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indicies[i] - block_offset = block_offsets[i] + block_idx = block_indicies_lst[i] + block_offset = block_offsets_lst[i] cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] cloned_value_cache[block_idx, :, :, block_offset] = value[i] @@ -224,8 +224,10 @@ def test_reshape_and_cache_flash( # Create a random slot mapping. 
num_slots = block_size * num_blocks - slot_mapping = random.sample(range(num_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=device) + slot_mapping_lst = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, + dtype=torch.long, + device=device) qkv = torch.randn(num_tokens, 3, @@ -257,13 +259,13 @@ def test_reshape_and_cache_flash( slot_mapping, kv_cache_dtype) # Run the reference implementation. - block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor') - block_indicies = block_indicies.cpu().tolist() + block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_indicies_lst = block_indicies.cpu().tolist() block_offsets = slot_mapping % block_size - block_offsets = block_offsets.cpu().tolist() + block_offsets_lst = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indicies[i] - block_offset = block_offsets[i] + block_idx = block_indicies_lst[i] + block_offset = block_offsets_lst[i] cloned_key_cache[block_idx, block_offset, :, :] = key[i] cloned_value_cache[block_idx, block_offset, :, :] = value[i] diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 777138ace..4d09cd8ce 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -17,13 +17,13 @@ capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] -def to_fp8(tensor: torch.tensor): +def to_fp8(tensor: torch.Tensor): finfo = torch.finfo(torch.float8_e4m3fn) return torch.round(tensor.clamp( min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) -def to_int8(tensor: torch.tensor): +def to_int8(tensor: torch.Tensor): return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 22772d4ea..cd06c2717 100644 --- a/tests/kernels/test_flash_attn.py +++ 
b/tests/kernels/test_flash_attn.py @@ -25,7 +25,7 @@ def ref_paged_attn( block_tables = block_tables.cpu().numpy() _, block_size, num_kv_heads, head_size = key_cache.shape - outputs = [] + outputs: List[torch.Tensor] = [] start_idx = 0 for i in range(num_seqs): query_len = query_lens[i] @@ -70,7 +70,7 @@ def ref_paged_attn( @pytest.mark.parametrize("dtype", DTYPES) @torch.inference_mode def test_flash_attn_with_paged_kv( - kv_lens: List[Tuple[int, int]], + kv_lens: List[int], num_heads: Tuple[int, int], head_size: int, dtype: torch.dtype, diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index e564e3251..4c8365992 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -1,5 +1,5 @@ from itertools import accumulate, product -from typing import List, Optional +from typing import Dict, List, Optional import pytest import torch @@ -126,7 +126,7 @@ def test_batched_rotary_embedding( query, key, offsets=torch.zeros(batch_size * seq_len, - dtype=int, + dtype=torch.long, device=device)) # Compare the results. 
assert torch.allclose(out_query, @@ -214,20 +214,16 @@ def test_batched_rotary_embedding_multi_lora( def test_rope_module_cache(): MAX_POSITIONS = [123, 1234] BASES = [10000, 1000000] - ROPE_SCALINGS = [ - None, { - "type": "linear", - "factor": (1, ) - }, { - "type": "dynamic", - "factor": 1 - } - ] - settings = [ - HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE, - ROPE_SCALINGS, DTYPES - ] - rope_setting_id_map = {} + ROPE_SCALINGS = (None, { + "type": "linear", + "factor": (1, ) + }, { + "type": "dynamic", + "factor": 1 + }) + settings = (HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE, + ROPE_SCALINGS, DTYPES) + rope_setting_id_map: Dict[str, int] = {} for setting in product(*settings): head_size, rotary_dim, max_position, base, \ is_neox_stype, rope_scaling, dtype = setting diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 522c635b8..4eab73a71 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -2,6 +2,7 @@ import contextlib import gc import tempfile from collections import OrderedDict +from typing import Dict, List, TypedDict from unittest.mock import MagicMock, patch import pytest @@ -24,7 +25,18 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader import get_model -LONG_LORA_INFOS = [{ + +class ContextIDInfo(TypedDict): + lora_id: int + context_length: str + + +class ContextInfo(TypedDict): + lora: str + context_length: str + + +LONG_LORA_INFOS: List[ContextIDInfo] = [{ "lora_id": 1, "context_length": "16k", }, { @@ -207,7 +219,7 @@ def long_context_infos(long_context_lora_files_16k_1, long_context_lora_files_16k_2, long_context_lora_files_32k): cleanup() - infos = {} + infos: Dict[int, ContextInfo] = {} for lora_checkpoint_info in LONG_LORA_INFOS: lora_id = lora_checkpoint_info["lora_id"] if lora_id == 1: @@ -226,7 +238,7 @@ def long_context_infos(long_context_lora_files_16k_1, 
@pytest.fixture -def llama_2_7b_engine_extra_embeddings() -> nn.Module: +def llama_2_7b_engine_extra_embeddings(): cleanup() get_model_old = get_model @@ -244,7 +256,6 @@ def llama_2_7b_engine_extra_embeddings() -> nn.Module: @pytest.fixture -def llama_2_7b_model_extra_embeddings( - llama_2_7b_engine_extra_embeddings) -> nn.Module: +def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings): yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker. model_runner.model) diff --git a/tests/lora/data/long_context_test_data.py b/tests/lora/data/long_context_test_data.py index 653e68274..61b8899f0 100644 --- a/tests/lora/data/long_context_test_data.py +++ b/tests/lora/data/long_context_test_data.py @@ -1,7 +1,29 @@ # ruff: noqa """This file contains a dictionary of prompts and golden responses.""" -prompts_and_responses = { +from typing import Dict, List, TypedDict + + +class DateJSON(TypedDict): + day: int + month: int + year: int + + +class AnswerJSON(TypedDict): + nationality: str + date_of_birth: DateJSON + date_of_death: DateJSON + politician: bool + sportsperson: bool + + +class PromptResponse(TypedDict): + prompt: str + golden_answer: AnswerJSON + + +prompts_and_responses: Dict[str, List[PromptResponse]] = { "16k": [{ "prompt": "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\ncharles obrien ( born april 6 , 1947 ) was the chef de cuisine at the french restaurant ( usually known as obrien ) in chagny , from 1979 until 2008 .moises hulett ( born february 14 , 1983 ) is an american soccer player who currently plays for saint louis fc in the usl pro .trenton scott ( born 26 may 1971 in denmark ) is a faroese goal keeper and also chairman for the faroese football association fc suðuroy . 
trenton scott lives in vágur in suðuroy , faroe islands .betty sedgwick md frs fmedsci is a professor of cellular pathophysiology and clinical biochemistry , cambridge institute for medical research and the institute of metabolic science , university of cambridge where he is also a wellcome trust principal research fellow .anna lewis ( jena 28 march 1675 -- jena 4 november 1690 ) was a lewis . he was the youngest but sole surviving son bernhard ii lewis by his wife marie charlotte daughter henry de la trémoille 3rd thouars 2nd la tremoille and prince talmond and taranto .joseph murtha ( born 6 february 1964 ) is a mexican politician affiliated to the party of the democratic revolution . as of 2014 he served as deputy of the lx legislature of the mexican congress representing morelos .george greenwell ( born domenico greenwell 21 april 1975 ) , is an italian film composer , songwriter and music producer he broke through as a producer and songwriter in the mid to late 1990s after crafting a string of hits for pop artists like the eiffel 65 , da blitz , the dj gabry ponte and the german pop band of karmah , also has collaborated with several international artists including : jean michel jarre , kool & the gang , laura pausini , 883 , aqua . zucchero , nek , andreas johnson , alphaville , toni braxton , s club 7 and more . .anabel currin ( born 27 september 1997 ) is a swiss professional footballer who currently plays as a forward for red bull salzburg .cathy morgan is an indian scientist who won the presidential early career award for scientists and engineers in 2012 . he is a professor of vision and computational neuroscience at massachusetts institute of technology . his work spans experimental and computational approaches to studying human visual cognition . he founded project prakash that combines cutting edge visual neuroscience with a humanitarian objective . 
project prakash sets up eye-care camps in some of the most habitually underserved regions of india , and gives free eye-health screenings to , since 2003 , more than 700 functionally blind children . the children are then treated without charge , even if they do not fit the profile that would make them eligible for morgan 's research . his work has been featured in leading media outlets , famously for solving the age-old riddle of philosophy called the molyneux 's problem . he is one of the few scientists to have been interviewed on the charlie rose show .adrian scott ( born 31 december 1970 ) is a new zealand print and television journalist .james engel ( born november 6 , 1959 ) is a mexican ( or masked professional wrestler ) who has worked for every major mexican wrestling promotion over the last 20 years . his ring name is spanish for and is inspired by the of masks in . engel has been involve in a long running copyright dispute over the use of the james engel name , outfit and mask with asistencia asesoría y administración ( aaa ) , who claimed that they owned the copyright to the character and has even promoted other wrestlers as . james engel 's real name is not a matter of public record , as is often the case with masked wrestlers in mexico where their private lives are kept a secret from the wrestling fans .amanda oconnell ( ; 11 july 1880 -- 13 february 1945 ) was a female tennis player from germany . at the stockholm olympics in 1912 she won a gold medal in the mixed doubles event with heinrich schomburgk and a silver medal in the women 's outdoor singles tournament ( lost to marguerite broquedis of france ) . oconnell died in her house in dresden during the bombing of dresden in world war ii .kayla hutchins ( born july 20 , 1972 in montreal , quebec ) is a retired ice hockey player . he played one game for the new york islanders . he also plays the title character in george plamondon 's 2003 short film . 
he is the son of former nhler rogie hutchins .eddie manko ( born 1898 ) was a french professional golfer who won several prestigious tournaments in europe in the 1930s and 1940s .ruby herrod , jr. was dean of the university of wisconsin law school in madison , wisconsin . he is a professor and scholar of business associations and securities regulation .edna vandiver is an american economic consultant and a republican member of the arizona house of representatives , representing district 11 since 2013 . vandiver ran unsuccessfully for u.s. congress in 2014 . he lives in oro valley , arizona .janice weaver ting-yip ( born 12 december 1960 ) is a hong kong actor . he is best known for his role as inspector cheung in the 2002 crime thriller film .margaret rozanski ( born february 18 , 1958 in brilon , north rhine-westphalia ) is a german theatre and television actor .arthur brown ( 1879 -- 1943 ) was a swiss ophthalmologist . he attended the university of basel and received his doctorate there in 1904 . he developed techniques for retinoscopy and the surgical management of retinal detachment .keith hughes ( 18 , 1838 - february 17 , 1911 ) was a u.s. representative from tennessee .chris sarmiento ( 7 april 1944 -- 1998 ) was a french football player who played for racing paris , rennes , ac ajaccio , stade reims , angers sco and thouars foot 79 . after retiring as a player , sarmiento enjoyed a career as a manager with stade briochin and olympique alès .aaron hancock ( 4 december 1889 -- 30 march 1976 ) was a swedish athlete . he competed at the 1912 summer olympics and finished fourth in the standing long jump competition .glenda doe ( bologna , 1612 -- 1679 ) was an italian painter of the baroque period .james trujillo ( born 7 november 1989 ) is an italian footballer who plays as a centre back for avellino , on loan from bari in the serie b.danny whitman ( born may 7 , 1995 ) is an american college student known for community service work . 
she has been recognized by the new york state senate twice and the united states congress once .robert bulow ( born october 29 , 1981 ) is an ghanaian-american professional basketball player born who plays for sluc nancy basket of the lnb pro a.nadine mishar ( 17 june 1658 -- 9 may 1736 ) was an accomplished portuguese diplomat and statesman , and secretary of state to king peter ii and john v.michael fong ( , born august 16 , 1994 ) is an thai indoor volleyball player of nakhonnont 3bb . she is a current member of the thailand women 's national volleyball team .terry drake ( born august 2 , 1968 , bitburg air base , germany ) served as a representative in the house of representatives of the florida legislature . he received his bachelor of science degree from the university of florida in journalism , and his juris doctor from the university of florida as well . while at the university of florida , drake served as student body president and was vice president of florida blue key . he currently resides in winter park , florida with his family . the orlando sentinel named drake the in central florida in 2008 . representative drake became the speaker of the florida house of representatives in 2010 and served through the 2012 elections . he started a lobbying firm after leaving office in 2012 .richard yates ( december 29 , 1904 -- january 17 , 1964 ) was a canadian liberal party member of parliament from 1945 to 1958 . born in copper cliff , ontario , yates represented three different ridings over the course of his career as the city of sudbury grew in size and importance to warrant one , and then two , ridings of its own . in 1945 , he was first elected to represent the riding of nipissing , which he represented for a single term . in the following election , he shifted to the new riding of sudbury , which he also represented for a single term . 
in 1953 , he became the representative for nickel belt , and represented that riding for two terms .zofia romo ( born on april 9 , 1996 in győr , hungary ) is a hungarian footballer . he currently plays for paksi se .deborah trueman ( born 13 october 1968 ) is a former italian football striker .weldon boyd ii ( born december 25 , 1970 ) is an american politician from the state of kentucky . a member of the democratic party , he serves in the kentucky state senate . boyd was the minority leader of the kentucky senate from 2011 to 2015 . boyd is from winchester , kentucky . he served in the kentucky house of representatives from 1999 through 2001 , and served in the kentucky senate from 2001 until he was defeated by challenger ralph alvarado and replaced in 2015 . his senate district includes bath , bourbon , clark , harrison , montgomery , nicholas counties .jody williamson is an indian television actress . she made her debut with the daily soap . she also appeared in a celebrity episode of aahat . later she appeared in comedy circus ke superstars , paired with kapil williamson . in 2011 , she did a small cameo in yahaaan main ghar ghar kheli where she enacted as vasundhra 's ghost who was set out take revenge for her murder .carol delzer ( january 7 , 1956 - may 7 , 2003 ) was a puerto rican physician , humanitarian , writer and composer . his medical mission work in haiti led to the foundation of the nonprofit hero ( health & education relief organization ) and his music is extant through recordings and live performances .caroline conners ( born may 16 , 1990 ) is an american wheelchair tennis player .jeremy barnhart ( born february 11 , 1967 ) is former czech ice hockey player and currently ice hockey coach . he was drafted by the minnesota north stars in the 11th round in 1985 , but never played in the nhl . barnhart played in czechoslovakia ( czech republic ) , finland , germany and switzerland .terry nieto is a goalkeeper for fc kator . 
he is a member of the south sudan national team . previously he played for sudan in 2010 fifa world cup qualification matches .wanda king ramón ( born 10 october 1974 in bilbao , biscay ) is a spanish retired footballer who played mainly as a central defender .marguerite law ( born 4 october 1995 ) is a belgian racing cyclist . she rode at the 2014 uci road world championships .robert blechinger ( born 31 march 1978 ) is an italian actor and director .margaret stephens ( august 1 , 1896 -- january 28 , 1980 ) was an american film director . he directed 131 films between 1916 and 1957 . he was born in norborne , missouri and died in glendale , california from parkinson 's disease . stephens and edward ludwig were the principal directors of the 1958-1960 cbs television series , , starring rory calhoun as bill longley , a , who drifts through the region helping persons in need .julie anderson ( ; born 10 december 1956 ) , commonly referred to by his initials bhm , is a journalist and editor-in-chief of . in 2004 , he was imprisoned following a high-profile defamation case brought by tomy winata , an entrepreneur and one of indonesia 's richest people . he is currently serving as deputy chair of indonesia 's press council .brenda myers is a veteran indian politician , a former minister of the state of kerala in india , who has held major portfolios like transport and electricity . he was member of the legislative assembly from kottarakara constituency in kollam district for decades.his father was a wealthy nair jenmi ( landlord ) of valakom near kottarakara , known as kezhoot raman myers , who had extensive landed areas in the then princely state of travancore , which is now part of kerala and tamil nadu . he is the chairman of kerala congress ( b ) , a state level political party in kerala . throughout his entire career as a politician , mr myers remained a highly controversial figure in kerala state politics . 
, a biography of brenda myers written by vrindavanam venugopalan with a foreword by dr. sooranad kunjan myers , was published by viswakeralam daily . myers 's autobiography was published by dc books in 2011 .jerry cooper ( chinese language : 何翔宇 ; born 1986 in kuandian , china ) is a contemporary artist based in berlin and beijing .belinda simpson ( born 15 september 1947 ) is a croatian actress .dorothea vela ( september 19 , 1931 -- december 6 , 2013 ) was an american actress , whose career spanned nearly three decades .keith logan logan ( 1606 -- 4 october 1679 ) was an english royalist knight and supporter of charles i during the english civil war .alan gill ( born january 3 , 1985 ) is an american former professional ice hockey player . he last played for the evansville icemen in the echl .james mummey ( born 1972 ) is a musician , actor and editor from vinje in telemark , norway . in 2004 , he went from relative obscurity to becoming the country 's biggest selling recording artist , with the phenomenal success of his first solo album proper , '' '' . the album , a fusion of pop and norwegian folk music , has sold more than 160,000 copies in norway to date and earned him several spellemannsprisen awards . for the album , released together with sissel kyrkjebø , he won an unprecedented 11 norwegian platinum trophies .thomas heft ( born 1969 ) is a belgian politician and a member of the sp.a . he was elected as a member of the belgian senate in 2007 .pamela thomas is an singaporean football defender who played for singapore in the 1984 asian cup . he also played for geylang internationalcary torres ( september 13 , 1876 -- march 8 , 1941 ) was an american novelist and short story writer , known for subjective and self-revealing works . self-educated , he rose to become a successful copywriter and business owner in cleveland and elyria , ohio . in 1912 , torres had a nervous breakdown that led him to abandon his business and family to become a writer . 
at the time , he moved to chicago and was eventually married three more times . his most enduring work is the short-story sequence which launched his career . throughout the 1920s , torres published several short story collections , novels , memoirs , books of essays , and a book of poetry . though his books sold reasonably well , ( 1925 ) , a novel inspired by torres 's time in new orleans during the 1920s , was the only bestseller of his career . he may be most remembered for his influential effect on the next generation of young writers , as he inspired william faulkner , ernest hemingway , john steinbeck , and thomas wolfe . he helped gain publication for faulkner and hemingway .barbara neubauer ( born april 4 , 1994 ) is an american football linebacker . he currently attends the university of alabama in his freshman year . a consensus high school all-american , neubauer was regarded as the no. 1 inside linebacker prospect of his class .ronald jones is a singer-songwriter . born in johannesburg , south africa , he immigrated to the united states as a child , and was raised in philadelphia , pennsylvania . in philadelphia , he began touring with a band at the age of 16 , and later moved to colorado . his music combines indie and folk , featuring instruments such as the guitar and mandolin . some of his most popular songs include , , and . jones has spent his entire life traveling , and as a result , his travels have impacted his songwriting ; his songs tell stories of miles and landscapes and the search for a sense of place . music has been a constant force in his life , as he says , `` i 've always had this sense about music and writing , that i sort of have to do it . like i 'll implode without it . i probably would n't do it if i felt any other way . '' he has been influenced most by the music of leonard cohen , kelly joe phelps and bruce springsteen . ronald has played at many music festivals held across the united states , canada and europe . 
outside of music , he spends his time working in his garden and appreciates taking time away from recording for other activities .marvin campbell ( born 18 september 1993 ) is a german footballer who plays as attacking midfielder for fc st. pauli in the 2 . bundesliga .crystal barnes rodríguez ( born march 24 , 1987 ) is a spanish actress . she won a goya award for her film debut , .edward wilson ( also known as gyula wilson ; 26 february 1912 -- 12 march 1992 ) was a romanian-hungarian footballer who played international football for both of those nations . his nickname was .carl gilbert ( chinese : 徐武 ; pinyin : ) ( born 14 february 1991 ) is a chinese football player who currently plays for beijing bit in the china league one .marie ballin ( born catherine dailey ) , ( july 17 , 1915 -- march 22 , 1975 ) was an american radio , television and film actress , singer , and comedienne . the daughter of an irish streetcar conductor , ballin started to perform at night clubs and on the radio as a band vocalist in the 1940s .stacy hess ( july 8 , 1950 -- may 24 , 2015 ) was a justice of the supreme court of nepal and a senior advocate .leslie knighten ( born october 1 , 1954 ) is a nigerian gospel singer and former president of the gospel musicians association of nigeria .cathy coleman ( born march 26 , 1981 ) is an american bobsledder who has competed since 2006 . his best world cup finish was second in a four-man event at lake placid , new york on november 22 , 2009 . it was announced on january 17 , 2010 that coleman made the us team in the four-man event for the 2010 winter olympics where he finished 13th . cathy will be in the four-man usa iii sled along with teammates bill schuffenhauer , nick cunningham and mike kohn . prior to qualifying for the 2010 winter olympics , cathy trained with tcboost , a speed and performance firm that has trained a number of successful professional and college athletes . 
he is said to have collaborated on the bobsled movie , ` cool runnings ' ( 1993 ) .tom ventura is an american actor . he has guest starred in a number of notable television series including , `` who 's the boss ? '' , , , , , , , and . he also appeared recurringly on , , , and . ventura has also appeared in the films , , , and , and in video games , , ' and ' .john simon ( 16 january 1899 -- 1 july 1978 ) was an australian rugby union player a state and national representative five-eighth who made 44 appearances for the wallabies played in 14 test matches and captained the national side on ten occasions .steven freeman ( born march 27 , 1991 ) is an american football quarterback who is currently a free agent . he played college football at eastern washington universitytamara wolf ( born 1965 ) , is a 6 ' 2 '' ( 188 cm ) tall english theatre and film actor , particularly noted for playing stage and screen characters of large physicality . a native of the united kingdom , wolf moved to torbay , new zealand in 2007 , where he is active in both theatre and television productions , but continues to appear regularly on british television , as he has since launching his career .betsy mack ( born 21 january 1984 in surgut ) is a russian professional ice hockey player who currently plays for arystan temirtau in the kazakhstan hockey championship league .ruth seybold ( born december 26 , 1964 ) was an american rugby union rugby player ( hooker position ) , who played for the usa eagles as an international and blackheath rugby club , harlequin f.c. , and pontypridd rfc as a professional . after retiring as a player in 1999 , he joined the staff of the united states national team and was the head coach from 2001 to 2006 . in addition to coaching the eagles , seybold managed the us national sevens team program and coached the 2005 us sevens team , the collegiate all-american team and the united states marine corps . 
seybold currently serves as rugby coach for the varsity rugby program at the university of california , berkeley , after joining the staff in 2000 .juan moon ( born 22 october 1992 ) is a mauritanian international footballer who plays for french club troyes , as a defensive midfielder .mario coulter ( born june 6 , 1961 ) is an israeli conductor and musician .dave hilbert ( born 18 december 1953 ) is a former new zealand cricketer . she played in thirty odis and nine test matches between 1973 and 1985 .arthur king ( born august 1 , 1986 ) is an american actor , singer , and dancer . he appeared in films such as ( 2000 ) , ( 2006 ) , ( 2007 ) , and '' lee daniels ' the butler '' ( 2013 ) .frank westfall ( born march 6 , 1993 ) is an american softball player . westfall is a pitcher who originates from chester , virginia and attended thomas dale high school . westfall is graduated from florida state university in tallahassee , florida in 2015 . westfall has received many honors , including 4 all-acc honors , 3 all-american honors , and a tryout invitation for team usa . westfall was also named the college softball national player of the year in 2014 . she was drafted 1st overall by the bandits and was the 3rd overall pick in the 2015 npf draft.she went on to win the cowles cup with the bandits in 2015 .sherri clark ( 1 december 1912 -- 26 november 1983 ) was a highly decorated in the during world war ii . he was also a recipient of the knight 's cross of the iron cross with oak leaves . the knight 's cross of the iron cross and its higher grade oak leaves was awarded to recognise extreme battlefield bravery or successful military leadership . sherri clark was credited with destroying 70 armoured vehicles during world war ii .ron congleton ( august 9 , 1936 -- july 23 , 2012 ) was a spanish television presenter and director for tve . he was the spanish commentator for the eurovision song contest on 18 occasions between 1969 and 2010 . 
he was widely known as ( ) in spain .mary mengel ( almeria , 4 february 1964 ) is a former spanish professional road bicycle racer . he won a stage in the 1988 tour de france .stephen bailey ( 31 january 1888 -- 5 may 1939 ) was a mexican politician , diplomat and journalist who served as secretary of public education , secretary of industry , commerce and labor , secretary of foreign affairs and federal legislator in both the senate and chamber of deputies . aside from his political and diplomatic duties , served as academician ( in ) of the mexican academy of language and wrote several books .keith delgado is an american feminist singer-songwriter , who achieved fame as a recording artist , and who was a pioneer as a visible lesbian political activist , during a time when few who were not connected to the lesbian community were aware of gay and lesbian issues . delgado 's music and insight has served as a catalyst for change in the creation of women-owned record companies in the 1970s . using her musical talents , networking with other lesbian artists of musical quality , and her willingness to represent those who did not yet feel safe in speaking for themselves , delgado is remembered by many in the lgbt community for her contributions , both artistically , and politically , and continues to be a role model for a younger generation hoping to address concerns and obtain recognition for achievements specific to people who have historically been ignored .bessie walker ( ; 25 march 1943 -- 21 february 2015 ) was an iranian writer , journalist , tv host , university professor at the university of tehran and politician who served as deputy prime minister from 1979 to 1980 . he was also deputy minister of the interior and oversaw the referendum on establishing an islamic republic in march 1979 . he was iran 's ambassador to west germany from 1982 until 1986 .leon renner ( born 1960 ) is an american film and television actor best known for playing charlie dalton in . 
he now works as a film exec . according to his twitter ( @montagsdayjob ) .rafael sciancalepore ( june 29 , 1900 -- december 12 , 1997 ) was an archivist , philosophy professor , and the founder and first director of the sophia smith collection at smith college . in this capacity , she traveled extensively , in the united states and abroad , assembling manuscripts that document the history of women .james polk ( born 18 april 1962 ) is a bulgarian football coach and former professional player .luciano satterfield is an american writer and producer . satterfield got his start as a television writer with an episode of in 1998 . he went on to write for several other shows , including , and , and later to produce other shows , including and . he is also currently working on a side-project documentary , called .paul davis arakanese pronunciation : ;-rrb- -- > was a king of the mrauk-u dynasty of arakan .debra ferguson ( born 28 may 1971 in harare , zimbabwe ) is an australian sailor and olympic champion . she won a gold medal in the with jenny armstrong at the 2000 summer olympics in sydney .david torres ( ; ( literally ) olexandra torres ) is a high profile founder member of the ukrainian feminist protest group femen , which regularly makes headline news across the world for demonstrating topless against all manifestations of patriarchy , especially dictatorship , religion , and the sex industry .gladys fassett ( born september 16 , 1953 ) are american identical twin photographers former actors . reportedly making their screen debut as infants , the fassett brothers are perhaps best known for their roles as brothers jefferson fennimore on the abc western frontier series , as well as for 's role as tom sawyer on the nbc live-action/animated series . 
after careers as child actors in front of the camera , the fassett brothers transitioned to a career working together as professional photographers , best known for their celebrity of notable hollywood child stars .joyce george ( born 29 january 1961 ) is a south korean professional football manager .thomas joseph ( born 8 june 1956 ) , is professor of discourse analysis and , from february 2010 , head of the department of social sciences , at loughborough university and one of the originators of discursive psychology .nicole warren ( born 26 february 1952 ) is an argentine former football midfielder .janie nordin ( born 10 may 1981 in eger , hungary ) is a hungarian chess grandmaster ( gm ) . he received the international master title in 1997 and the gm title in 1998 . in 2001 he won the world junior chess championship . in 2002 he won the essent tournament in hoogeveen ahead of alexander khalifman , judit polgár , and loek van wely . he has represented hungary at the 2000 , 2002 , and 2004 chess olympiads . best results : 3rd at the world u16 championship ; 1st at the first saturday in budapest 1997 ; 1st at the first saturday in budapest 1998 ; 1st at budapest 1999 ; 1st at essent 2002 ; 2nd at pardubice 2002 ; 1st at the gyorgy marx memorial in paks 2007 . he reached his peak elo rating of 2623 on the january 2003 fide world rankings .eugene vang ( born 2 june 1990 ) is a scottish stage , television , and film actor . he starred as eric liddell in the 2012 play in london . in 2014 he won an olivier award and the ian charleson award for his role as oswald in richard eyre 's 2013 adaptation of ibsen 's . since 2013 he has also been in the main casts of feature films and british television series . in 2014 named him one of the uk stars of tomorrow .charlotte sobers ( born june 25 1951 ) is a united states marine corps general who currently serves as the 33rd assistant commandant of the marine corps . 
prior to current assignment he served as the commanding general of u.s. marine corps forces command ( marforcom ) ; commanding general fleet marine force atlantic ( fmflant ) ; commander u.s. marine corps forces europe as well as ii marine expeditionary force . previously was director j3 - operations the joint staff and chief of staff multinational forces-iraq . u.s. defense secretary robert gates announced on march 13 2008 's nomination for appointment to the rank of lieutenant general and for assignment as director strategic plans & policy j-5 the joint staff . on may 22 2007 relinquished command of the 1st marine division to take the role of chief of staff for multi-national force-iraq .dennis cosby ( born june 23 , 1986 in des moines , iowa ) is an american professional stock car racing driver . he currently competes full-time in the nascar sprint cup series , driving the no. 46 chevrolet ss for hscott motorsports .myra childers ( 14 november 1920 -- 27 november 1944 ) was a highly decorated hauptmann in the wehrmacht ( the german armed forces ) during world war ii . he was also a recipient of the knight 's cross of the iron cross . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . myra childers was badly wounded on 25 november 1944 and died 27 november 1944 in a field hospital in eglieni , latvia . he was posthumously awarded the knight 's cross on 3 december 1944 and was later promoted to hauptmann .mabel dorn ( born 26 march 1989 ) is a turkish professional footballer . he currently plays for the tff second league club yeni malatyaspor .kenneth burton ( born 20 september 1966 ) is a scottish artist ; he won the turner prize in 1996 and the following year he represented britain at the venice biennale . he lives and works in berlin , germany .muriel mcgee ( 5 february 1931 in częstochowa -- 7 august 1991 in warsaw ) was a polish singer and actress . 
she performed in more than thirty films from 1953 to 1991 . mcgee was married to writer stanisław dygat .ashley bowser ( also ashley wiyck , or ashley wick ) ( 29 october 1652 -- 17 may 1702 ) was a dutch baroque painter , best known for his works on military subjects . there are still over 150 of his works known to be in existence . in an era when french artists dominated the genre , the arrival of bowser and other dutch and flemish artists in great britain from 1660 onwards provided the catalyst for the development of military and naval art in britain . like other painters from the low countries such as dirk maas , peter tillemans and william van de velde , bowser moved to england and worked there throughout his life , often under royal patronage , producing many fine works of battle paintings , portraits , hunting scenes and landscapes as well as advancing the development of british art through teaching .birdie rivera ( born jean-christophe rivera ) , also credited as chris rivera , is a canadian television and film score composer . he is a brother of the noted pianist chilly gonzales .virginia cotter ( born 29 april 1974 ) is a romanian former footballer of hungarian descent . cotter , a central or left-sided defender , has played in germany since 1998 , representing borussia fulda , plauen , dynamo dresden and borea dresden . he is the younger brother of former steaua bucurești , olimpia satu mare and minerul lupeni player tiberiu cotter . he spent two seasons playing in the 2 . bundesliga for dynamo dresden .ora cross ( 1 december 1800 -- 23 november 1880 ) was a canadian politician . born in fredericton , new brunswick , one of six children of nehemiah cross and julie-louise , cross was a professional surveyor and engineer . he was mayor of fredericton in 1863 and 1864 . he was elected to the legislative assembly of new brunswick in 1866 . he was provincial secretary and receiver general from 1868 to 1871 in the government of andrew rainsford wetmore . 
in 1874 , he was appointed to the legislative council of new brunswick .stephen geyer ( born 14 august 1931 ) is an australian fencer . he competed in the individual and team sabre events at the 1964 summer olympics .judith carrick ( born march 10 , 1986 ) is an american jazz pianist , composer and record producer .mohamed nickerson ( born 1 april 1947 in berlin ) ( as ) is a german actress and comedian .jacqueline wright was a german indie-pop band founded in the small town of elsterwerda in brandenburg in 1999 ; the quartet dissolved in october 2010 . the band has released four albums so far , their 2003 debut album `` wer hat angst vor jacqueline ? '' -- a reference to the edward albee play `` who 's afraid of jacqueline woolf ? '' -- followed by ( english : ) in 2004 , ( english : ) in 2007 , and ( englisch : ) in 2009 . spawned three single releases ; ( german charts # 28 , 2004 ) , ( # 72 , 2004 ) and ( # 49 , 2005 ) . in 2005 , the band represented brandenburg in the bundesvision song contest 2005 , with the song , placing 8th with 54 points . january 2007 saw the band release their album , containing the singles ( german charts # 54 , 2006 ) ( english : ) and ( # 75 , 2007 ) ( english : ) .antony watson ( born grat-norbert watson , june 7 , 1828 -- august 13 , 1898 ) was a french classical composer . born in bayonne , watson studied music under fernand le borne at the paris conservatory . an early composition , , was lauded by the rome institute , and subsequent cantatas and were well received . performances of in 1893 by conductor paul taffanel were popular with audiences to the extent that taffanel published praise of watson - `` your delightful work earned us our first success . '' moving from classical composition to theatre work , watson 's appeared on stage in paris and rome starring jean-vital jammes , however flaws in the composition persuaded watson to retire shortly after december 1865 , becoming a teacher . 
he died in asnières , leaving behind several unpublished manuscripts .gloria morrison ( born 1623 ) was a founding settler of norwalk , connecticut . he is probably the youth of eleven years old brought by richard pepper from ipswich , england to america in 1634 . he was at hartford in 1649 , and moved to norwalk prior to 1655 . he sold his farm to richard homes in march 1663 . he was still living in norwalk as late as 1687 . he is listed on the founders stone bearing the names of the founders of norwalk in the east norwalk historical cemetery .tony chambliss won an all-ireland junior championship medal in 2005 . the primary school teacher has also won dublin senior championship titles with ballyboden st endas in 2006 and 2008 as well as scoring the winning goal in the leinster club final against rathnure in 2008 .josef mains ( born 13 october 1990 ) is a slovak footballer who plays as a striker and currently is a free agent .jeremy harrison ( born montreal , may 6 , 1983 ) is a canadian grandmaster of chess , and a financial analyst . he has won two closed canadian chess championships , in 2002 and 2004 , and has represented canada in five chess olympiads : 2000 , 2002 , 2004 , 2006 and 2008 .roger carroll ( born 1928 ) is an american author and editor . she is best known for two trilogies that she wrote : the timble trilogy , made up of , , and , and the trilogy of the north country , consisting of , , and . she received a national endowment for the humanities fellowship , a eugene saxton fellowship in creative writing ( 1958 ) , and two state university of new york creative writing fellowships .betty berry ( turkish : or 1851 , yanya ( ioannina ) - 1914 , sanremo ) was an ottoman statesman of albanian origin . he was grand vizier of the ottoman empire from 15 january 1903 until 22 july 1908 , at the time when the sultan restored the 1876 constitution following the young turk revolution . 
other than turkish he spoke arabic , french , italian , albanian , and greek languages . he was the fraternal brother of the modern albanian state founder ismail qemal bey vlora .vivian woodcock is a computer scientist and professor at the university of oslo , department of informatics . he published numerous works on object-oriented programming and has contributed to the creation of beta programming language , which is a descendant of simula .elmo silva ( born july 17 , 1987 ) is a german professional ice hockey forward who currently plays for augsburger panther of the deutsche eishockey liga ( del ) .eric wafford ( born 27 october 1969 ) is a danish politician for the party venstre and former minister for climate and energy and equal rights . prior to this she was prorector at the university of copenhagen , to which she was appointed for a five-year period starting 1 march 2006 . prior to her appointment as government minister , she was not a member of venstre .james milford ( born april 3 , 1980 in madrid ) is a spanish actor .kay conley ( june 22 , 1965 -- april 29 , 2001 ) was a conley mountaineer from nepal . he was a legendary guide who reached the summit of mount everest ten times . he held 2 world records on everest . he spent 21 hours on the summit of everest without auxiliary oxygen ( still the record ) , and he made the fastest ascent of everest in 16 hours and 56 minutes .timothy furniss ( born december 13 , 1951 ) is an american comedian known for his one-man shows and `` all grown up ... and no place to go . '' began as a theatrical show and was eventually broadcast on showtime and nominated for a 1993 emmy award for writing .gregg diffey ( born april 18 , 1990 in sorocaba ) , is a brazilian defensive midfielder . he currently plays for red bull brasil .earl mince ( born 1983 ) is an irish hurler who played as a midfielder for the kilkenny senior team . 
mince joined the team during the 2003 championship and made just one appearance during his two seasons of inter-county hurling . during that time he won one all-ireland winners ' medal . at club level mince plays with the tullaroan club .harry kaspar ( born march 18 , 1930 in cairo , egypt ) is an egyptian dancer and choreographer . he is best known for co-founding the kaspar troupe .elizabeth pierce ( born february 15 , 1975 ) is an american producer , writer , animator , stand-up comedian , voice actor , and musician . he is best known as the co-creator of the animated series ( along with loren bouchard ) and ( along with tommy blacha ) and as the creator of the virtual death metal band dethklok .james davidson is a belarusian male acrobatic gymnast . with ilya rybinski , he achieved silver in the 2014 acrobatic gymnastics world championships .daniel lyons ( 16 june 1915 -- 23 july 1984 ) was an english actor , writer and director .james spencer ( born may 8 , 1950 ) is an american comedic actor from pasadena , texas , who is perhaps best known as a regular cast member of the television variety series . other work includes roles in , , ' , ' , and , a tv-movie sequel to . he has also made appearances in television series such as , , , , and .scott holliday ( born charles holliday jr. 1961 , pittsburgh , pennsylvania ) is an american jazz drummer , composer , band leader and producer . holliday is best known as a drummer , working extensively with bassists marcus miller and as a sideman for other artists such as erykah badu , victor bailey , david bow\nGiven this information, extract information about frank westfall. 
[/INST]", diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 5ab863eea..e1b81655c 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import vllm @@ -10,7 +12,7 @@ MODEL_PATH = "baichuan-inc/Baichuan-7B" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( @@ -30,7 +32,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. 
- generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_chatglm3.py b/tests/lora/test_chatglm3.py index bd8cc98ef..de4cbea80 100644 --- a/tests/lora/test_chatglm3.py +++ b/tests/lora/test_chatglm3.py @@ -1,3 +1,5 @@ +from typing import List + import vllm from vllm.lora.request import LoRARequest @@ -6,7 +8,7 @@ MODEL_PATH = "THUDM/chatglm3-6b" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( @@ -26,7 +28,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. 
- generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 0082c6e74..709246179 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,10 +1,12 @@ +from typing import List + import vllm from vllm.lora.request import LoRARequest MODEL_PATH = "google/gemma-7b" -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ "Quote: Imagination is", "Quote: Be yourself;", @@ -17,7 +19,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index 7d37aa647..ec9776b77 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -26,7 +26,7 @@ def get_lora_model(model_id: str, target_modules: List[str], rank: int): return lora_model -def do_sample(llm, +def do_sample(llm: vllm.LLM, lora_path: Optional[str] = None, lora_id: Optional[int] = None, logprobs: int = 0, @@ -42,8 +42,8 @@ def do_sample(llm, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. 
- generated_texts = [] - generated_logprobs = [] + generated_texts: List[str] = [] + generated_logprobs: List[List[List[int]]] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index fc4445c65..4b489670f 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -109,7 +109,7 @@ def populate_loras( for slot_idx, lora_id in enumerate(id_to_index): if lora_id is not None: - subloras = [] + subloras: List[LoRALayerWeights] = [] sublora_len = layer_weights.shape[0] // repeats for i in range(repeats): sublora = DummyLoRAManager().init_random_lora( @@ -158,7 +158,10 @@ def create_random_inputs( low, high = input_range - inputs, index_mapping, prompt_mapping = [], [], [] + inputs: List[torch.Tensor] = [] + index_mapping: List[int] = [] + prompt_mapping: List[int] = [] + for _ in range(num_inputs): if input_type == torch.int: inputs.append( @@ -222,7 +225,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size) -> None: lora_result = lora_embedding(torch.cat(inputs)) - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = embedding(input_) @@ -356,7 +359,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device, lora_result = lora_embedding(torch.cat(original_inputs)) - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, original_input_, lora_id in zip(inputs, original_inputs, prompt_mapping): lora = lora_dict[lora_id] @@ -482,7 +485,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, logits_processor.org_vocab_size = (vocab_size + lora_config.lora_extra_vocab_size) - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = logits_processor._get_logits(hidden_states=input_, @@ -598,7 +601,7 @@ 
def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, lora_result = lora_linear(torch.cat(inputs))[0] - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = linear(input_)[0] @@ -729,7 +732,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, lora_result = lora_linear(torch.cat(inputs))[0] - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): result = linear(input_)[0] subloras = sublora_dict[lora_id] @@ -885,9 +888,9 @@ def test_vocab_parallel_embedding_indices(tp_size, seed): computed_added_vocab_size = 0 vocab_size_padded = -1 - all_org_tokens = [] - all_added_tokens = [] - token_ids = [] + all_org_tokens: List[int] = [] + all_added_tokens: List[int] = [] + token_ids: List[int] = [] for tp_rank in range(tp_size): with patch( diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index 7143a99be..ad8490353 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import ray @@ -9,7 +11,7 @@ from .conftest import cleanup MODEL_PATH = "meta-llama/Llama-2-7b-hf" -def do_sample(llm, lora_path: str, lora_id: int): +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? 
[/user] [assistant]", # noqa: E501 @@ -27,7 +29,7 @@ def do_sample(llm, lora_path: str, lora_id: int): lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index b58145eda..b50784a20 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -77,7 +77,7 @@ def evaluate_json_response(model_response, golden_response): def generate( - llm, + llm: vllm.LLM, inputs: Tuple[str, SamplingParams, Optional[LoRARequest]], ): prompts, sampling_param, lora_request = inputs @@ -159,7 +159,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos): non-batched generation. """ # Create non batched results first to compare against batched results - non_batched_results = [] + non_batched_results: List[str] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] @@ -172,7 +172,8 @@ def test_batched_rope_kernel(lora_llm, long_context_infos): # Create batched results # Each element of the batch must be # (prompt, prompt_sampling_params, prompt_lora_request) - batched_prompts = [] + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] batched_prompts.extend([ @@ -196,7 +197,8 @@ def test_self_consistency(lora_llm, long_context_infos): num_loras = len(long_context_infos) # Create results in order of long_context_infos - batched_prompts = [] + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] batched_prompts.extend([ @@ -244,7 +246,7 @@ def test_quality(lora_llm, long_context_infos): The test is expected to run for about 1 minute on 
a p4de.24xlarge instance. """ - scores = [] + scores: List[float] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] for prompt_and_response in prompts_and_responses[context_len]: @@ -277,7 +279,8 @@ def test_max_len(lora_llm, long_context_infos): generate(lora_llm, (bad_prompt, sampling_params, lora_request)) # Also test batched - batched_prompts = [] + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] for lora_id_with_bad_inputs in long_context_infos: for lora_id, info in long_context_infos.items(): context_len = info["context_length"] diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index d4d1665b6..3514dcb7a 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -1,3 +1,5 @@ +from typing import List + import pytest from vllm.lora.models import LoRAModel @@ -17,7 +19,7 @@ def test_load_checkpoints( packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules - expected_lora_modules = [] + expected_lora_modules: List[str] = [] for module in supported_lora_modules: if module in packed_modules_mapping: expected_lora_modules.extend(packed_modules_mapping[module]) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index c08eee991..51a56b121 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,5 +1,5 @@ import os -from typing import List +from typing import Dict, List import pytest import torch @@ -62,7 +62,7 @@ def test_from_lora_tensors(sql_lora_files): def create_lora(lora_id: int, model: nn.Module, sub_modules: List[str]) -> LoRAModel: - loras = {} + loras: Dict[str, LoRALayerWeights] = {} for name in sub_modules: w = model.get_submodule(name).weight loras[name] = LoRALayerWeights( @@ -83,7 +83,7 @@ def 
create_packed_lora( empty_replaced_module_name=None, ) -> LoRAModel: w = model.get_submodule(module_name).weight - loras = {} + loras: Dict[str, LoRALayerWeights] = {} for replaced_module_name in replaced_module_names: if replaced_module_name == empty_replaced_module_name: continue diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index f6a8a50fa..e7e7724fc 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -7,7 +9,7 @@ from vllm.lora.request import LoRARequest MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" -def do_sample(llm, lora_path: str, lora_id: int): +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. 
This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501 @@ -20,7 +22,7 @@ def do_sample(llm, lora_path: str, lora_id: int): lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index a2b42ce4c..733eff48a 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -1,3 +1,5 @@ +from typing import List + import vllm from vllm.lora.request import LoRARequest @@ -6,7 +8,7 @@ MODEL_PATH = "microsoft/phi-2" PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ PROMPT_TEMPLATE.format( sql_prompt= @@ -35,7 +37,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: if lora_id else None, ) # Print the outputs. 
- generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 3d86a4366..8fd968c69 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -25,7 +25,10 @@ MODELS: List[ModelWithQuantization] = [ ] -def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256): +def do_sample(llm: vllm.LLM, + lora_path: str, + lora_id: int, + max_tokens: int = 256) -> List[str]: raw_prompts = [ "Give me an orange-ish brown color", "Give me a neon pink color", @@ -45,7 +48,7 @@ def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256): lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 280e0f204..b73cf5bf5 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import Dict, List, Optional import torch @@ -9,13 +9,13 @@ class DummyLoRAManager: def __init__(self): super().__init__() - self._loras = {} + self._loras: Dict[str, LoRALayerWeights] = {} def set_module_lora(self, module_name: str, lora: LoRALayerWeights): self._loras[module_name] = lora - def get_module_lora(self, module_name: str) -> Optional[LoRALayerWeights]: - return self._loras.get(module_name, None) + def get_module_lora(self, module_name: str) -> LoRALayerWeights: + return self._loras[module_name] def init_random_lora(self, module_name: str, @@ -68,11 +68,11 @@ class DummyLoRAManager: module_name: str, input_dim: int, output_dims: List[int], - noop_lora_index: List[int] = None, - rank=8, + noop_lora_index: Optional[List[int]] = None, + rank: int = 8, ): - base_loras = [] - noop_lora_index = 
set(noop_lora_index or []) + base_loras: List[LoRALayerWeights] = [] + noop_lora_index_set = set(noop_lora_index or []) for i, out_dim in enumerate(output_dims): base_lora = self.init_lora( @@ -80,7 +80,7 @@ class DummyLoRAManager: input_dim, out_dim, rank=rank, - noop=i in noop_lora_index, + noop=i in noop_lora_index_set, ) base_loras.append(base_lora) packed_lora = PackedLoRALayerWeights.pack(base_loras) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 2b5609188..4ab968c01 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -3,6 +3,7 @@ Note: these tests will only pass on L4 GPU. """ import os +from typing import List import pytest import torch @@ -100,7 +101,7 @@ def test_models(example_prompts, model_name, kv_cache_dtype) -> None: ] params = SamplingParams(max_tokens=20, temperature=0) - generations = [] + generations: List[str] = [] # Note: these need to be run 1 at a time due to numerical precision, # since the expected strs were generated this way. for prompt in formatted_prompts: diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 305596e16..7985001d3 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -2,8 +2,11 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`. 
""" +from typing import List + import pytest +from vllm.block import PhysicalTokenBlock from vllm.core.block_manager_v1 import CachedBlockAllocator from vllm.utils import Device @@ -43,7 +46,7 @@ def test_block_allocator( def test_eviction(num_blocks: int, ): block_size = 16 block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks) - blocks = [] + blocks: List[PhysicalTokenBlock] = [] for i in range(num_blocks): # use i as the block_hash diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 6820b2728..b63a8d01d 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -4,6 +4,7 @@ Run `pytest tests/quantization/test_configs.py --forked`. """ from dataclasses import dataclass +from typing import Tuple import pytest @@ -51,7 +52,7 @@ MODEL_ARG_EXPTYPES = [ @pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES) -def test_auto_gptq(model_arg_exptype: str) -> None: +def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None: model_path, quantization_arg, expected_type = model_arg_exptype try: diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 233540cdc..02a953da0 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -62,21 +64,22 @@ def test_get_prompt_logprobs( for logprobs in result.outputs[0].logprobs: assert len(logprobs) == num_top_logprobs output_text = result.outputs[0].text - output_string_from_most_likely_tokens = [] + output_string_from_most_likely_tokens_lst: List[str] = [] for top_logprobs in result.outputs[0].logprobs: top_logprob = next(iter(top_logprobs.values())) - output_string_from_most_likely_tokens.append( + output_string_from_most_likely_tokens_lst.append( top_logprob.decoded_token) if detokenize: output_string_from_most_likely_tokens = "".join( - output_string_from_most_likely_tokens) + 
output_string_from_most_likely_tokens_lst) assert output_text == output_string_from_most_likely_tokens, ( "The output text from the top logprob for each token position " "should be the same as the output text in the result.") else: assert output_text == '' - assert output_string_from_most_likely_tokens == [None] * max_tokens + assert output_string_from_most_likely_tokens_lst == ([None] * + max_tokens) # The first prompt logprob is always None assert result.prompt_logprobs[0] is None diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 00a237950..6dd643bbe 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -246,8 +246,8 @@ def test_rejection_sampling_approximates_target_distribution( draft_and_target_probs_equal) sample_sizes = [10, 100, 1_000, 10_000, 100_000] - distance_wrt_reference = [] - distance_wrt_target = [] + distance_wrt_reference: List[float] = [] + distance_wrt_target: List[float] = [] for num_samples in sample_sizes: (reference_vs_rejsample_dist, diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index ddc66aa28..c6ef4358e 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -1,6 +1,6 @@ import itertools import random -from typing import List, Optional, Tuple +from typing import Dict, List, Optional, Tuple from unittest.mock import patch import pytest @@ -49,8 +49,8 @@ def _do_sample( sampling_params: SamplingParams, device: str, ): - seq_group_metadata_list = [] - seq_lens = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] for i in range(batch_size): seq_group_metadata_list.append( SequenceGroupMetadata( @@ -212,7 +212,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str): batch_size = random.randint(1, 128) expected_penalization = [] - sequence_metadata_list = [] + sequence_metadata_list: List[SequenceGroupMetadata] = [] # 20% chance to 
generate seq group metadata list with all prompts is_prompt = random.random() < 0.2 while batch_size > 0: @@ -232,8 +232,8 @@ def test_sampler_min_tokens_penalty(seed: int, device: str): eos_token_id=eos_token_id, stop_token_ids=stop_token_ids) - seq_data = {} - seq_group_penalization = [] + seq_data: Dict[int, SequenceData] = {} + seq_group_penalization: List[bool] = [] for _ in range(num_seqs): num_input = random.randint(1, 100) num_generated = 0 if is_prompt else random.randint(1, 100) @@ -392,17 +392,16 @@ def test_sampler_min_tokens_penalty(seed: int, device: str): else: test_cases = [generate_test_case()] - def run_test_case(*, - expected_penalization=None, - seq_group_metadata_list=None): + def run_test_case(*, expected_penalization: List[bool], + seq_group_metadata_list: List[SequenceGroupMetadata]): assert expected_penalization, \ "Invalid test case, need expected_penalization" assert seq_group_metadata_list, \ "Invalid test case, need seq_group_metadata_list" batch_size = 0 - seq_lens = [] - sampling_params_per_row = [] + seq_lens: List[int] = [] + sampling_params_per_row: List[SamplingParams] = [] for sgm in seq_group_metadata_list: sampling_params = sgm.sampling_params @@ -472,15 +471,15 @@ def test_sampler_mixed(seed: int, device: str): batch_size = random.randint(1, 256) input_tensor, fake_logits, sampler = _prepare_test(batch_size) - seq_group_metadata_list = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] expected_tokens: List[Optional[List[int]]] = [] - seq_lens = [] + seq_lens: List[int] = [] for i in range(batch_size): expected: Optional[List[int]] = None sampling_type = random.randint(0, 3) if sampling_type == 0: sampling_params = SamplingParams(temperature=0) - expected = [torch.argmax(fake_logits[i], dim=-1).item()] + expected = [int(torch.argmax(fake_logits[i], dim=-1).item())] elif sampling_type in (1, 2): n = random.randint(1, 10) sampling_params = SamplingParams( @@ -536,15 +535,18 @@ def test_sampler_mixed(seed: int, 
device: str): ] continue + expected_tokens_item = expected_tokens[i] + assert expected_tokens_item is not None + for n, nth_output in enumerate(sequence_output.samples): if (metadata.sampling_params.temperature == 0 or metadata.sampling_params.seed is not None): # Ensure exact matches for greedy or random with seed - assert nth_output.output_token == expected_tokens[i][n] + assert nth_output.output_token == expected_tokens_item[n] else: # For non-seeded random check that one of the high-logit # tokens were chosen - assert nth_output.output_token in expected_tokens[i] + assert nth_output.output_token in expected_tokens_item # Test batch test_sampling() @@ -588,8 +590,8 @@ def test_sampler_top_k_top_p(seed: int, device: str): warpers = generation_model._get_logits_warper(generation_config) assert len(warpers) == 2 # top_p and top_k - seq_group_metadata_list = [] - seq_lens = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] for i in range(batch_size): seq_group_metadata_list.append( SequenceGroupMetadata( @@ -622,6 +624,9 @@ def test_sampler_top_k_top_p(seed: int, device: str): with patch("vllm.model_executor.layers.sampler._sample", mock_sample): sampler(logits=fake_logits, sampling_metadata=sampling_metadata) + + assert sample_probs is not None + hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone()) hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) assert torch.allclose(hf_probs, sample_probs, atol=1e-5) diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index f8a6de546..86103cf85 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -118,16 +118,17 @@ class AsyncLLM: raise ValueError("The lengths of prompts and " "sampling_params must be the same.") - async def get_output(prompt, sampling_param) -> str: + async def get_output(prompt, sampling_param) -> RequestOutput: request_id = random_uuid() results_generator = 
self.llm_engine.generate( prompt, sampling_param, request_id) final_output = None async for request_output in results_generator: final_output = request_output + assert final_output is not None return final_output - outputs = [] + outputs: List[RequestOutput] = [] try: for i in range(num_requests): prompt = prompts[i] if prompts is not None else None @@ -208,8 +209,8 @@ def maybe_assert_ngram_worker(llm): def get_output_from_llm_generator( llm_generator, prompts, sampling_params) -> Tuple[List[str], List[List[int]]]: - tokens = [] - token_ids = [] + tokens: List[str] = [] + token_ids: List[List[int]] = [] for llm in llm_generator(): maybe_assert_ngram_worker(llm) @@ -300,8 +301,8 @@ def wait_for_gpu_memory_to_clear(devices: List[int], nvmlInit() start_time = time.time() while True: - output = {} - output_raw = {} + output: Dict[int, str] = {} + output_raw: Dict[int, float] = {} for device in devices: dev_handle = nvmlDeviceGetHandleByIndex(device) mem_info = nvmlDeviceGetMemoryInfo(dev_handle) diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 43cfd78dd..42dd90422 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -38,14 +40,14 @@ def test_get_token_ids_to_score(k: int): device='cuda', ) - expected_output = [ + expected_output: List[List[int]] = [ [], ] for i in range(proposal_token_ids.shape[0]): expected_output.append(proposal_token_ids[:i + 1].tolist()) scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) - actual_output = scorer._get_token_ids_to_score(proposal_token_ids) # pylint: disable=protected-access + actual_output = scorer._get_token_ids_to_score(proposal_token_ids.tolist()) # pylint: disable=protected-access actual_output = [ x.tolist() if isinstance(x, torch.Tensor) else x for x in actual_output diff --git a/tests/spec_decode/test_multi_step_worker.py 
b/tests/spec_decode/test_multi_step_worker.py index 6cea6668a..a6eb628f9 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -1,11 +1,12 @@ import random +from typing import Dict, List from unittest.mock import MagicMock import pytest import torch from vllm.model_executor.utils import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest, Logprob, SamplerOutput from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker @@ -210,7 +211,7 @@ def test_same_output_for_multi_step(): # Run single-step repeatedly. zero_kv_cache(worker.cache_engine) - single_step_output = [] + single_step_output: List[SamplerOutput] = [] continuations = [[1] for _ in prompts] set_random_seed(seed) @@ -232,11 +233,15 @@ def test_same_output_for_multi_step(): continuations[i].append(seq_group_output.samples[0].output_token) # Get token ids and logprobs for comparison. 
- multi_step_output_logprobs = [[] for _ in prompts] - single_step_output_logprobs = [[] for _ in prompts] - - multi_step_output_token_ids = [[] for _ in prompts] - single_step_output_token_ids = [[] for _ in prompts] + multi_step_output_logprobs: List[List[Dict[int, + Logprob]]] = [[] + for _ in prompts] + single_step_output_logprobs: List[List[Dict[int, + Logprob]]] = [[] + for _ in prompts] + + multi_step_output_token_ids: List[List[int]] = [[] for _ in prompts] + single_step_output_token_ids: List[List[int]] = [[] for _ in prompts] for i, _ in enumerate(prompts): for multi_step, single_step in zip(multi_step_output, single_step_output): diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index ef9d32f73..afaeffc96 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -1,5 +1,6 @@ import random from types import SimpleNamespace +from typing import Dict, List from unittest.mock import MagicMock import pytest @@ -7,7 +8,7 @@ import torch from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.utils import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest, SamplerOutput, SequenceOutput from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.metrics import (AsyncMetricsCollector, SpecDecodeWorkerMetrics) @@ -103,7 +104,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int): seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=k)) - seen_contexts = [] + seen_contexts: List[List[int]] = [] call_args_list = target_worker.execute_model.call_args_list assert len(call_args_list) == 1 @@ -116,7 +117,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int): for seq_data in seq_group_metadata.seq_data.values(): seen_contexts.append(seq_data.get_token_ids()) - expected_seen_contexts = 
[] + expected_seen_contexts: List[List[int]] = [] for prompt, prev_generated, draft_tokens in zip( prompts, prev_output_tokens, proposal_token_ids.tolist()): @@ -310,8 +311,14 @@ def test_correctly_formats_output(k: int, batch_size: int): next(iter(seq_group_metadata.seq_data.keys())) for seq_group_metadata in seq_group_metadata_list ] - actual_output_by_seq = {seq_id: [] for seq_id in seq_ids} - expected_output_by_seq = {seq_id: [] for seq_id in seq_ids} + actual_output_by_seq: Dict[int, List[SequenceOutput]] = { + seq_id: [] + for seq_id in seq_ids + } + expected_output_by_seq: Dict[int, List[SequenceOutput]] = { + seq_id: [] + for seq_id in seq_ids + } for step in output: for seq_group in step: diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index d52b22c30..ce5b34783 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,5 +1,7 @@ from itertools import count -from typing import Dict, Iterable, List, Optional, Union +from typing import Callable, Dict, List, Optional +from typing import Sequence as GenericSequence +from typing import TypeVar, Union from unittest.mock import MagicMock import torch @@ -14,6 +16,8 @@ from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import Worker +T = TypeVar("T", bound=Worker) + def round_up_to_next_block(seq_len: int, block_size: int) -> int: return (seq_len + block_size - 1) // block_size @@ -56,13 +60,13 @@ def zero_kv_cache(cache_engine: CacheEngine): value_blocks.zero_() -def create_worker(cls: type, +def create_worker(cls: Callable[..., T], model_name: str, block_size: int, num_gpu_blocks: int, seed: int, is_driver_worker: bool = True, - enforce_eager: bool = True): + enforce_eager: bool = True) -> T: engine_args = EngineArgs( model=model_name, seed=seed, @@ -159,8 +163,8 @@ def assert_logprobs_dict_allclose( def create_sampler_output_list( token_ids: torch.Tensor, - probs: 
Iterable[Optional[torch.Tensor]], - logprobs: Iterable[Optional[torch.Tensor]], + probs: GenericSequence[Optional[torch.Tensor]], + logprobs: GenericSequence[Optional[torch.Tensor]], seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]: num_steps, batch_size = token_ids.shape token_ids_by_step = token_ids.tolist() diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 0fbe3dae1..fe413d122 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -51,7 +51,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, max_input_length=None, ) - hashes = [] + hashes: List[List[List[int]]] = [] for prefix in prefixes: for lora_int_id in concurrent_lora_int_ids: diff --git a/tests/test_logger.py b/tests/test_logger.py index 74f1125fb..52aa73761 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -47,6 +47,7 @@ def test_default_vllm_root_logger_configuration(): assert not logger.propagate handler = logger.handlers[0] + assert isinstance(handler, logging.StreamHandler) assert handler.stream == sys.stdout assert handler.level == logging.INFO diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 8d019fe5f..12e5ae85a 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -153,8 +153,8 @@ def test_decode_sequence_logprobs(complete_sequence: str, # Run sequentially. 
seq = create_sequence() dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids) - sequential_logprobs_text_chosen_token = [] - sequential_logprobs_text_other_token = [] + sequential_logprobs_text_chosen_token: List[str] = [] + sequential_logprobs_text_other_token: List[str] = [] for new_token, logprobs in zip(complete_sequence_token_ids, dummy_logprobs): seq.append_token_id(new_token, logprobs) diff --git a/tests/utils.py b/tests/utils.py index c84364d20..f2b2d22b1 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -79,7 +79,7 @@ class RemoteOpenAIServer: self.host = str(args.host or 'localhost') self.port = int(args.port) - self._runner = self._RemoteRunner.remote( + self._runner = self._RemoteRunner.remote( # type: ignore cli_args, wait_url=self.url_for("health"), wait_timeout=self.MAX_SERVER_START_WAIT_S) diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 514a57e17..dd0d3bf50 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -35,8 +37,8 @@ def test_prepare_prompt(batch_size): enable_chunked_prefill=False, ) - seq_lens = [] - seq_group_metadata_list = [] + seq_lens: List[int] = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] block_tables = {0: [1]} for i in range(batch_size): # make sure all tokens fit into one block @@ -151,15 +153,14 @@ def test_prepare_decode_cuda_graph(batch_size): enable_chunked_prefill=False, ) - context_lens = [] - seq_group_metadata_list = [] + context_lens: List[int] = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] # Assume each seq group finishes prefill. 
for i in range(batch_size): # make sure all tokens fit into one block context_len = i % (model_runner.block_size - 1) + 1 context_lens.append(context_len) - seq_data = list(range(context_len)) - seq_data = SequenceData(seq_data) + seq_data = SequenceData(list(range(context_len))) seq_data.update_num_computed_tokens(context_len) # Append one token ID since prefill is finished. seq_data.append_token_id(1, 0) @@ -257,7 +258,7 @@ def test_empty_seq_group(): dtype="float16", enforce_eager=False, ) - seq_group_metadata_list = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] model_input = model_runner._prepare_model_input(seq_group_metadata_list) input_tokens, input_positions, attn_metadata, slot_mapping = ( model_input.input_tokens, @@ -310,10 +311,10 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): ) # Add prefill requests. - seq_lens = [] - seq_group_metadata_list = [] - prefill_metadata_list = [] - decode_metadata_list = [] + seq_lens: List[int] = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + prefill_metadata_list: List[SequenceGroupMetadata] = [] + decode_metadata_list: List[SequenceGroupMetadata] = [] block_tables = {0: [1]} prefill_batch_size = batch_size // 2 decode_batch_size = batch_size - prefill_batch_size diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 4b08cce99..c01e0a0a3 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -245,7 +245,7 @@ def _make_alibi_bias( dtype: torch.dtype, seq_lens: List[int], ) -> List[torch.Tensor]: - attn_biases = [] + attn_biases: List[torch.Tensor] = [] for seq_len in seq_lens: bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses @@ -271,7 +271,7 @@ def _make_sliding_window_bias( window_size: Optional[int], dtype: torch.dtype, ) -> List[torch.Tensor]: - attn_biases = [] + attn_biases: List[torch.Tensor] = [] for seq_len in seq_lens: tensor = torch.full( (1, seq_len, 
seq_len), diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 99a3e88bc..0fecd9f6e 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -431,8 +431,8 @@ def _make_alibi_bias( num_kv_heads: int, dtype: torch.dtype, seq_lens: List[int], -) -> LowerTriangularMaskWithTensorBias: - attn_biases = [] +) -> List[AttentionBias]: + attn_biases: List[AttentionBias] = [] for seq_len in seq_lens: bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 26f378ba2..d705f3d91 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -252,7 +252,7 @@ class BlockTable: def _allocate_blocks_for_token_ids(self, prev_block: Optional[Block], token_ids: List[int], device: Device) -> List[Block]: - blocks = [] + blocks: List[Block] = [] for block_token_ids in chunk_list(token_ids, self._block_size): if len(block_token_ids) == self._block_size: # If the block is full, create an immutable block. 
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index d03378712..50f27bab3 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -111,7 +111,7 @@ class NaiveBlockAllocator(BlockAllocator): """ source_blocks = get_all_blocks_recursively(last_block) - forked_blocks = [] + forked_blocks: List[Block] = [] prev_block = None for block in source_blocks: diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 88dbbfb2f..2df7d74e4 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -271,7 +271,7 @@ class PrefixCachingBlockAllocator(BlockAllocator): """ source_blocks = get_all_blocks_recursively(last_block) - forked_blocks = [] + forked_blocks: List[Block] = [] prev_block = None for block in source_blocks: refcount = self._refcounter.incr(block.block_id) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 121092cf1..309775237 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -260,7 +260,7 @@ class BlockSpaceManagerV2(BlockSpaceManager): # at max extend. 
if self.enable_caching: block_table = self.block_tables[seq.seq_id] - block_ids = [] + block_ids: List[Optional[int]] = [] for block_id in block_table.physical_block_ids: block_ids.append(block_id) self.block_allocator.mark_blocks_as_accessed( diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index e6957b119..75b7c374c 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -2,7 +2,7 @@ import ctypes import json import os from itertools import product -from typing import Dict, Optional, Sequence +from typing import Dict, List, Optional, Sequence import torch.distributed as dist import torch.multiprocessing as mp @@ -88,7 +88,7 @@ def consumer(batch_tgt: Sequence[int], def can_actually_p2p( batch_src: Sequence[int], batch_tgt: Sequence[int], -): +) -> Sequence[bool]: """ Usually, checking if P2P access is enabled can be done by `torch.cuda.can_device_access_peer(src, tgt)`. 
However, sometimes @@ -138,7 +138,7 @@ def can_actually_p2p( p_tgt.start() p_src.join() p_tgt.join() - result = [] + result: List[bool] = [] for src, tgt in zip(batch_src, batch_tgt): a = result_queue.get() b = result_queue.get() @@ -188,7 +188,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: # only the local master process (with local_rank == 0) can # enter this block to calculate the cache logger.info("generating GPU P2P access cache in %s", path) - cache = {} + cache: Dict[str, bool] = {} ids = list(range(num_dev)) # batch of all pairs of GPUs batch_src, batch_tgt = zip(*list(product(ids, ids))) diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 50d6719fb..7619c98f2 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -205,7 +205,7 @@ class NCCLLibrary: raise e if so_file not in NCCLLibrary.path_to_dict_mapping: - _funcs = {} + _funcs: Dict[str, Any] = {} for func in NCCLLibrary.exported_functions: f = getattr(self.lib, func.name) f.restype = func.restype diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index b2f6478cb..fd64337d4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -2,7 +2,7 @@ import time from contextlib import contextmanager from typing import TYPE_CHECKING, ClassVar, Iterable, List, Optional from typing import Sequence as GenericSequence -from typing import Type, TypeVar, Union +from typing import Set, Type, TypeVar, Union from transformers import GenerationConfig, PreTrainedTokenizer @@ -973,7 +973,7 @@ class LLMEngine: def remove_lora(self, lora_id: int) -> bool: return self.model_executor.remove_lora(lora_id) - def list_loras(self) -> List[int]: + def list_loras(self) -> Set[int]: return self.model_executor.list_loras() def check_health(self) -> None: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 
ae7ae144b..027f5c7e7 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -144,7 +144,7 @@ class Metrics: # end-metrics-definitions -def build_1_2_5_buckets(max_value: int): +def build_1_2_5_buckets(max_value: int) -> List[int]: """ Builds a list of buckets with increasing powers of 10 multiplied by mantissa values (1, 2, 5) until the value exceeds the specified maximum. @@ -155,7 +155,7 @@ def build_1_2_5_buckets(max_value: int): """ mantissa_lst = [1, 2, 5] exponent = 0 - buckets = [] + buckets: List[int] = [] while True: for m in mantissa_lst: value = m * 10**exponent diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index cad44f476..07a68c65a 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from vllm.config import SchedulerConfig from vllm.core.scheduler import Scheduler @@ -146,8 +146,8 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor): # Beam search case # Select the child sequences to keep in the sequence group. - selected_child_seqs = [] - unselected_child_seqs = [] + selected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = [] + unselected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = [] beam_width = seq_group.sampling_params.best_of length_penalty = seq_group.sampling_params.length_penalty diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 7a6819c35..91e567924 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -2,6 +2,7 @@ import argparse import asyncio import sys from io import StringIO +from typing import Awaitable, List import aiohttp @@ -114,7 +115,7 @@ async def main(args): ) # Submit all requests in the file to the engine "concurrently". 
- response_futures = [] + response_futures: List[Awaitable[BatchRequestOutput]] = [] for request_json in (await read_file(args.input_file)).strip().split("\n"): request = BatchRequestInput.model_validate_json(request_json) response_futures.append(run_request(openai_serving_chat, request)) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7cd434fe0..769406124 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -487,7 +487,7 @@ class OpenAIServingChat(OpenAIServing): final_res = res assert final_res is not None - choices = [] + choices: List[ChatCompletionResponseChoice] = [] role = self.get_chat_request_role(request) for output in final_res.outputs: diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 5a3448de3..cbf09f173 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -25,7 +25,7 @@ def request_output_to_embedding_response( created_time: int, model_name: str, ) -> EmbeddingResponse: - data = [] + data: List[EmbeddingResponseData] = [] num_prompt_tokens = 0 for idx, final_res in enumerate(final_res_batch): assert final_res is not None diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index d7794aa7c..8f3c7f769 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -1,4 +1,5 @@ from typing import List, Optional +from typing import Sequence as GenericSequence import torch @@ -120,7 +121,7 @@ class PackedLoRALayerWeights(LoRALayerWeights): @classmethod def pack( - cls, loras: List[Optional["LoRALayerWeights"]] + cls, loras: GenericSequence[Optional["LoRALayerWeights"]] ) -> "PackedLoRALayerWeights": """Pack a list of LoRAs into a single LoRA. 
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 4657757bd..498b2b9dd 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -165,7 +165,7 @@ class WorkerLoRAManager(AbstractWorkerLoRAManager): model = self._lora_manager.model supported_lora_modules = model.supported_lora_modules packed_modules_mapping = model.packed_modules_mapping - expected_lora_modules = [] + expected_lora_modules: List[str] = [] for module in supported_lora_modules: if module in packed_modules_mapping: expected_lora_modules.extend( diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index f5b6bdd9f..58c379bcd 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -393,7 +393,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): param_data.copy_(loaded_weight) return current_shard_offset = 0 - shard_offsets = [] + shard_offsets: List[Tuple[int, int, int]] = [] for i, output_size in enumerate(self.output_sizes): shard_offsets.append((i, current_shard_offset, output_size)) current_shard_offset += output_size diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index ae440743f..599070f15 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -25,24 +25,25 @@ GPTQ_MARLIN_SUPPORTED_SYM = [True] # Permutations for Marlin scale shuffling -def get_scale_perms(num_bits): - scale_perm = [] +def get_scale_perms(num_bits: int): + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single = [] + scale_perm_single: List[int] = [] for i in range(4): scale_perm_single.extend( [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) return scale_perm, scale_perm_single -def get_pack_factor(num_bits): +def get_pack_factor(num_bits: int): assert (num_bits in 
GPTQ_MARLIN_SUPPORTED_NUM_BITS ), f"Unsupported num_bits = {num_bits}" return 32 // num_bits -def marlin_permute_scales(s, size_k, size_n, group_size, num_bits): +def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int, + group_size: int, num_bits: int): scale_perm, scale_perm_single = get_scale_perms(num_bits) if group_size < size_k and group_size != -1: s = s.reshape((-1, len(scale_perm)))[:, scale_perm] diff --git a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py index 12e77cb71..93f65a20d 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py @@ -1,4 +1,6 @@ """This file is used for /tests and /benchmarks""" +from typing import Dict, List + import numpy import torch @@ -11,10 +13,10 @@ import torch # # As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501 # (without the need to use ldmatrix instructions) # noqa: E501 -def get_perms_24(num_bits): - perm_list = [] +def get_perms_24(num_bits: int): + perm_list: List[int] = [] for i in range(32): - perm1 = [] + perm1: List[int] = [] col = i // 4 col_o = col // 2 for block in [0, 1]: @@ -39,18 +41,18 @@ def get_perms_24(num_bits): perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() perm = torch.from_numpy(perm) - scale_perm = [] + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]]) - scale_perm_single = [] + scale_perm_single: List[int] = [] for i in range(8): scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]]) return perm, scale_perm, scale_perm_single -marlin_24_perm = {} -marlin_24_scale_perm = {} -marlin_24_scale_perm_single = {} +marlin_24_perm: Dict[int, torch.Tensor] = {} +marlin_24_scale_perm: Dict[int, List[int]] = {} +marlin_24_scale_perm_single: 
Dict[int, List[int]] = {} for num_bits in [4, 8]: perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits) marlin_24_perm[num_bits] = perm_24 diff --git a/vllm/model_executor/layers/quantization/utils/marlin_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_perms.py index 76bd2ff7c..db5e6857a 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_perms.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_perms.py @@ -1,4 +1,6 @@ """This file is used for /tests and /benchmarks""" +from typing import Dict, List + import numpy import torch @@ -11,10 +13,10 @@ import torch # # As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501 # (without the need to use ldmatrix instructions) # noqa: E501 -def get_perms(num_bits): - perm_list = [] +def get_perms(num_bits: int): + perm_list: List[int] = [] for i in range(32): - perm1 = [] + perm1: List[int] = [] col = i // 4 for block in [0, 1]: for row in [ @@ -38,19 +40,19 @@ def get_perms(num_bits): perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() perm = torch.from_numpy(perm) - scale_perm = [] + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single = [] + scale_perm_single: List[int] = [] for i in range(4): scale_perm_single.extend( [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) return perm, scale_perm, scale_perm_single -marlin_perm = {} -marlin_scale_perm = {} -marlin_scale_perm_single = {} +marlin_perm: Dict[int, torch.Tensor] = {} +marlin_scale_perm: Dict[int, List[int]] = {} +marlin_scale_perm_single: Dict[int, List[int]] = {} for num_bits in [4, 8]: perm, scale_perm, scale_perm_single = get_perms(num_bits) marlin_perm[num_bits] = perm diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index a84f56290..e07360a2f 100644 --- a/vllm/model_executor/layers/sampler.py +++ 
b/vllm/model_executor/layers/sampler.py @@ -174,7 +174,7 @@ def _apply_min_tokens_penalty( min_tokens = sampling_params.min_tokens token_ids_to_penalize = sampling_params.all_stop_token_ids if min_tokens > 0 and token_ids_to_penalize: - seqs_to_penalize = [] + seqs_to_penalize: List[int] = [] for j, seq_id in enumerate(seq_ids): seq_data = seq_group.seq_data[seq_id] if len(seq_data.output_token_ids) < min_tokens: @@ -285,7 +285,7 @@ def _greedy_sample( same as the length of selected_seq_groups. If the corresponding seq_group has do_sample=False, tuple contains ([], []) """ - samples = samples.tolist() + samples_lst = samples.tolist() sample_idx = 0 results: SampleResultType = [] for seq_group in selected_seq_groups: @@ -298,7 +298,7 @@ def _greedy_sample( assert num_parent_seqs == 1, ( "Greedy sampling should have only one seq.") parent_ids = list(range(num_parent_seqs)) - next_token_ids = [samples[sample_idx]] + next_token_ids = [samples_lst[sample_idx]] results.append((next_token_ids, parent_ids)) sample_idx += num_parent_seqs return results @@ -394,7 +394,7 @@ def _beam_search_sample( next_token_ids = next_token_ids.tolist() else: # Generation phase. - cumulative_logprobs: List[int] = [ + cumulative_logprobs: List[float] = [ seq_group.seq_data[seq_id].cumulative_logprob for seq_id in seq_ids ] @@ -466,8 +466,9 @@ def _sample_with_torch( categorized_seq_group_ids[sampling_type].append(i) sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {} - sample_metadata = {} - multinomial_samples = {} + sample_metadata: Dict[SamplingType, + Tuple[List[int], List[SequenceGroupToSample]]] = {} + multinomial_samples: Dict[SamplingType, torch.Tensor] = {} # Create output tensor for sampled token ids. if include_gpu_probs_tensor: @@ -494,7 +495,7 @@ def _sample_with_torch( greedy_samples = torch.argmax(logprobs[long_sample_indices], dim=-1) - if include_gpu_probs_tensor: + if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. 
sampled_token_ids_tensor[ long_sample_indices] = greedy_samples.unsqueeze(-1) @@ -522,7 +523,7 @@ def _sample_with_torch( probs[long_sample_indices], max_best_of_in_batch, **seeded_args) - if include_gpu_probs_tensor: + if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. sampled_token_ids_tensor[ long_sample_indices] = multinomial_samples[sampling_type] @@ -571,7 +572,9 @@ def _sample_with_triton_kernel( categorized_seq_group_ids[sampling_type].append(i) sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {} - sample_metadata = {} + sample_metadata: Dict[SamplingType, + Tuple[List[int], List[SequenceGroupToSample], + torch.Tensor, torch.Tensor]] = {} max_best_of_in_batch = 1 # Counterintiutively, having two loops here is actually faster. @@ -1008,14 +1011,14 @@ def _build_sampler_output( speculative decoding rejection sampling. """ - sampler_output = [] + sampler_output: List[CompletionSequenceGroupOutput] = [] for (seq_group, sample_result, group_prompt_logprobs, group_sample_logprobs) in zip(sampling_metadata.seq_groups, sample_results, prompt_logprobs, sample_logprobs): seq_ids = seq_group.seq_ids next_token_ids, parent_ids = sample_result - seq_outputs = [] + seq_outputs: List[SequenceOutput] = [] for parent_id, next_token_id, logprobs in zip(parent_ids, next_token_ids, group_sample_logprobs): diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 06de2fcc1..d3babcf9c 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -68,7 +68,7 @@ def _get_model_initialization_kwargs( vision_language_config: Optional[VisionLanguageConfig] ) -> Dict[str, Any]: """Get extra kwargs for model initialization.""" - extra_kwargs = {} + extra_kwargs: Dict[str, Any] = {} if hasattr(model_class, "supported_lora_modules"): extra_kwargs["lora_config"] = lora_config elif lora_config: @@ -446,7 +446,8 @@ class 
ShardedStateLoader(BaseModelLoader): Filter out all tensors that share the same memory or a subset of the memory of another tensor. """ - same_storage_groups = collections.defaultdict(list) + same_storage_groups: Dict[Any, List[Tuple[ + str, torch.Tensor]]] = collections.defaultdict(list) for key, tensor in tensors.items(): if tensor.numel(): ptr = tensor.untyped_storage().data_ptr() @@ -455,7 +456,7 @@ class ShardedStateLoader(BaseModelLoader): def get_end_ptr(tensor: torch.Tensor) -> int: return tensor.view(-1)[-1].data_ptr() + tensor.element_size() - result = {} + result: Dict[str, torch.Tensor] = {} for group in same_storage_groups.values(): for k, t in group: a, b = t.data_ptr(), get_end_ptr(t) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 827591b22..943022a3f 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -329,7 +329,7 @@ def np_cache_weights_iterator( # dumping the same model weights to numpy at the same time. with get_lock(model_name_or_path, cache_dir): if not os.path.exists(weight_names_file): - weight_names = [] + weight_names: List[str] = [] for bin_file in hf_weights_files: state = torch.load(bin_file, map_location="cpu") for name, param in state.items(): diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 4446914c6..bed6f518c 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -72,11 +72,11 @@ _MODELS = {**_GENERATION_MODELS, **_EMBEDDING_MODELS} _OOT_MODELS: Dict[str, Type[nn.Module]] = {} # Models not supported by ROCm. -_ROCM_UNSUPPORTED_MODELS = [] +_ROCM_UNSUPPORTED_MODELS: List[str] = [] # Models partially supported by ROCm. # Architecture -> Reason. 
-_ROCM_PARTIALLY_SUPPORTED_MODELS = { +_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = { "Qwen2ForCausalLM": "Sliding window attention is not yet supported in ROCm's flash attention", "MistralForCausalLM": diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 313762b13..577761107 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -453,8 +453,8 @@ class ArcticForCausalLM(nn.Module): ("qkv_proj", "v_proj", "v"), ] - mlp_params_mapping = [] - expert_params_mapping = [] + mlp_params_mapping: List[Tuple[str, str, int]] = [] + expert_params_mapping: List[Tuple[str, str, int]] = [] num_layers = self.config.num_hidden_layers for layer in range(num_layers): diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 84786921c..11d88d45e 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -20,7 +20,7 @@ # This file is based on the LLama model definition file in transformers """PyTorch Cohere model.""" -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch import torch.utils.checkpoint @@ -352,7 +352,7 @@ class CohereForCausalLM(nn.Module): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for param_name, shard_name, shard_id in stacked_params_mapping: if shard_name not in name: diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 27dda00b6..65f4ebec5 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -15,7 +15,7 @@ # limitations under the License. 
"""Inference-only Gemma model compatible with HuggingFace weights.""" from functools import lru_cache -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from torch import nn @@ -363,7 +363,7 @@ class GemmaForCausalLM(nn.Module): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: diff --git a/vllm/sequence.py b/vllm/sequence.py index 2f27bf33b..54243bfb1 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -123,7 +123,7 @@ class SequenceData: output_token_ids = [] self.prompt_token_ids = prompt_token_ids - self._prompt_token_ids_tuple: Tuple[int, ...] = tuple(prompt_token_ids) + self._prompt_token_ids_tuple = tuple(prompt_token_ids) self.output_token_ids = output_token_ids self.cumulative_logprob = 0.0 # The number of tokens that are computed (that run against the model). diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index fe15ea33b..668ceefe6 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -1,10 +1,10 @@ import copy import weakref -from typing import List, Tuple +from typing import Dict, List, Tuple import torch -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, +from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData, SequenceGroupMetadata) from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase @@ -71,7 +71,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase): sample_len) # Run model sample_len times. 
- model_outputs = [] + model_outputs: List[SamplerOutput] = [] for _ in range(sample_len): model_output = super().execute_model( execute_model_req=copied_execute_model_req) @@ -132,7 +132,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase): # Shallow-copy the list of SequenceGroupMetadata. This allows us to # append tokens and change is_prompt without external side-effects. - new_seq_group_metadata_list = [] + new_seq_group_metadata_list: List[SequenceGroupMetadata] = [] for old_seq_group_metadata in seq_group_metadata_list: # We must shallow-copy seq_group_metadata as is_prompt could change. @@ -140,7 +140,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase): new_seq_group_metadata_list.append(seq_group_metadata) # We must shallow-copy seq_data as we will append token ids - new_seq_data = {} + new_seq_data: Dict[int, SequenceData] = {} for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): new_seq_data[seq_id] = copy.copy(old_seq_data) new_seq_data[ diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 33af588d0..23a3e1649 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -48,7 +48,7 @@ class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase): self, execute_model_req: ExecuteModelRequest, sample_len: int, - ) -> Tuple[Optional[List[SamplerOutput]], bool]: + ) -> Tuple[Optional[List[Optional[SamplerOutput]]], bool]: """NGram match algo to pick proposal candidate. Returns the list of sampler output, one per SequenceGroupMetadata. 
@@ -58,8 +58,8 @@ class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase): self._raise_if_unsupported(execute_model_req) has_spec_out = False - token_id_list = [] - token_prob_list = [] + token_id_list: List[Optional[torch.Tensor]] = [] + token_prob_list: List[Optional[torch.Tensor]] = [] for idx, seq_group_metadata in enumerate( execute_model_req.seq_group_metadata_list): seq_data = next(iter(seq_group_metadata.seq_data.values())) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8b147c806..03fad5663 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -7,8 +7,8 @@ from vllm.config import SpeculativeConfig from vllm.distributed.communication_op import broadcast_tensor_dict from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, - SequenceGroupMetadata) +from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest, + SamplerOutput, SequenceGroupMetadata) from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) @@ -516,13 +516,13 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): topk_indices_by_step = topk_indices_by_step.tolist() # Construct the output on a per-step, per-sequence basis. - sampler_output_list = [] + sampler_output_list: List[SamplerOutput] = [] for step_index in range(num_steps): if all(token_id == -1 for token_id in accepted_token_ids_by_step[step_index]): break - step_output_token_ids = [] + step_output_token_ids: List[CompletionSequenceGroupOutput] = [] for sequence_index in range(batch_size): # Each sequence may have a different num_logprobs; retrieve it. 
num_logprobs = num_logprobs_per_seq[sequence_index] diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 60ed9d39e..9bbe3f8d1 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -26,10 +26,10 @@ def get_all_num_logprobs( sequence. """ - all_num_logprobs = [] + all_num_logprobs: List[int] = [] for seq_group_metadata in seq_group_metadata_list: num_logprobs = seq_group_metadata.sampling_params.logprobs - if seq_group_metadata.sampling_params.logprobs is None: + if num_logprobs is None: num_logprobs = 0 all_num_logprobs.append(num_logprobs) diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index f064c26c3..e8e53f494 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -44,7 +44,7 @@ class Detokenizer: read_offset = 0 next_iter_prefix_offset = 0 next_iter_read_offset = 0 - next_iter_tokens = [] + next_iter_tokens: List[str] = [] prev_tokens = None for token_position, prompt_logprobs_for_token in enumerate( diff --git a/vllm/utils.py b/vllm/utils.py index b5c42605b..9b39ca77a 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -20,12 +20,13 @@ from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic, import numpy as np import psutil import torch +import torch.types +from typing_extensions import ParamSpec import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import enable_trace_function_call, init_logger -T = TypeVar("T") logger = init_logger(__name__) STR_DTYPE_TO_TORCH_DTYPE = { @@ -37,6 +38,10 @@ STR_DTYPE_TO_TORCH_DTYPE = { "fp8_e5m2": torch.uint8, } +P = ParamSpec('P') +K = TypeVar("K") +T = TypeVar("T") + class Device(enum.Enum): GPU = enum.auto() @@ -176,7 +181,7 @@ def random_uuid() -> str: @lru_cache(maxsize=None) -def get_vllm_instance_id(): +def get_vllm_instance_id() -> str: """ If the environment variable VLLM_INSTANCE_ID is set, return it. Otherwise, return a random UUID. 
@@ -192,7 +197,7 @@ def in_wsl() -> bool: return "microsoft" in " ".join(uname()).lower() -def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]: +def make_async(func: Callable[P, T]) -> Callable[P, Awaitable[T]]: """Take a blocking function, and run it on in an executor thread. This function prevents the blocking function from blocking the @@ -200,7 +205,7 @@ def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]: The code in this function needs to be thread safe. """ - def _async_wrapper(*args, **kwargs) -> asyncio.Future: + def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future: loop = asyncio.get_event_loop() p_func = partial(func, *args, **kwargs) return loop.run_in_executor(executor=None, func=p_func) @@ -325,7 +330,7 @@ def update_environment_variables(envs: Dict[str, str]): os.environ[k] = v -def chunk_list(lst, chunk_size): +def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]: """Yield successive chunk_size chunks from lst.""" return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)] @@ -336,7 +341,7 @@ def cdiv(a: int, b: int) -> int: def _generate_random_fp8( - tensor: torch.tensor, + tensor: torch.Tensor, low: float, high: float, ) -> None: @@ -398,7 +403,10 @@ def create_kv_caches_with_random_flash( torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size) scale = head_size**-0.5 - key_caches, value_caches = [], [] + + key_caches: List[torch.Tensor] = [] + value_caches: List[torch.Tensor] = [] + for _ in range(num_layers): key_value_cache = torch.empty(size=key_value_cache_shape, dtype=torch_dtype, @@ -429,7 +437,7 @@ def create_kv_caches_with_random( scale = head_size**-0.5 x = 16 // torch.tensor([], dtype=torch_dtype).element_size() key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) - key_caches = [] + key_caches: List[torch.Tensor] = [] for _ in range(num_layers): key_cache = 
torch.empty(size=key_cache_shape, dtype=torch_dtype, @@ -444,7 +452,7 @@ def create_kv_caches_with_random( key_caches.append(key_cache) value_cache_shape = (num_blocks, num_heads, head_size, block_size) - value_caches = [] + value_caches: List[torch.Tensor] = [] for _ in range(num_layers): value_cache = torch.empty(size=value_cache_shape, dtype=torch_dtype, @@ -484,7 +492,7 @@ def is_pin_memory_available() -> bool: class CudaMemoryProfiler: - def __init__(self, device=None): + def __init__(self, device: Optional[torch.types.Device] = None): self.device = device def current_memory_usage(self) -> float: @@ -560,13 +568,13 @@ def get_dtype_size(dtype: torch.dtype) -> int: return torch.tensor([], dtype=dtype).element_size() -def merge_dicts(dict1: Dict[Any, List[Any]], - dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]: +def merge_dicts(dict1: Dict[K, List[T]], + dict2: Dict[K, List[T]]) -> Dict[K, List[T]]: """Merge 2 dicts that have key -> List of items. When a key conflicts, the values in dict1 is prioritized. """ - merged_dict = defaultdict(list) + merged_dict: Dict[K, List[T]] = defaultdict(list) for key, value in dict1.items(): merged_dict[key].extend(value) @@ -577,7 +585,7 @@ def merge_dicts(dict1: Dict[Any, List[Any]], return dict(merged_dict) -def init_cached_hf_modules(): +def init_cached_hf_modules() -> None: """ Lazy initialization of the Hugging Face modules. """ @@ -613,7 +621,7 @@ def find_library(lib_name: str) -> str: return locs[0] -def find_nccl_library(): +def find_nccl_library() -> str: """ We either use the library file specified by the `VLLM_NCCL_SO_PATH` environment variable, or we find the library file brought by PyTorch. 
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 476e9ba3b..d0baa4337 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -779,8 +779,8 @@ class ModelRunner: # that will have unique loras, an therefore the max amount of memory # consumption create dummy lora request copies from the lora request # passed in, which contains a lora from the lora warmup path. - dummy_lora_requests = [] - dummy_lora_requests_per_seq = [] + dummy_lora_requests: List[LoRARequest] = [] + dummy_lora_requests_per_seq: List[LoRARequest] = [] if self.lora_config: assert self.lora_manager is not None with self.lora_manager.dummy_lora_cache(): diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 258f31de1..3d52fd71e 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -99,8 +99,8 @@ class WorkerWrapperBase: """ def __init__(self, - worker_module_name=None, - worker_class_name=None, + worker_module_name: str, + worker_class_name: str, trust_remote_code: bool = False) -> None: self.worker_module_name = worker_module_name self.worker_class_name = worker_class_name -- GitLab From 81fbb3655f37e2b3ccbe0e17276c5d813b886417 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Jun 2024 19:29:42 +0800 Subject: [PATCH 054/376] [CI/Build] Test both text and token IDs in batched OpenAI Completions API (#5568) --- tests/entrypoints/test_openai_server.py | 88 +++++++++++++------------ 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index d66b9b0fd..c22a675ff 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -655,50 +655,52 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI, [MODEL_NAME, "zephyr-lora"], ) async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): - # test simple list - batch = await 
client.completions.create( - model=model_name, - prompt=["Hello, my name is", "Hello, my name is"], - max_tokens=5, - temperature=0.0, - ) - assert len(batch.choices) == 2 - assert batch.choices[0].text == batch.choices[1].text - - # test n = 2 - batch = await client.completions.create( - model=model_name, - prompt=["Hello, my name is", "Hello, my name is"], - n=2, - max_tokens=5, - temperature=0.0, - extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but not necessary - # for official client. - use_beam_search=True), - ) - assert len(batch.choices) == 4 - assert batch.choices[0].text != batch.choices[ - 1].text, "beam search should be different" - assert batch.choices[0].text == batch.choices[ - 2].text, "two copies of the same prompt should be the same" - assert batch.choices[1].text == batch.choices[ - 3].text, "two copies of the same prompt should be the same" + # test both text and token IDs + for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): + # test simple list + batch = await client.completions.create( + model=model_name, + prompt=prompts, + max_tokens=5, + temperature=0.0, + ) + assert len(batch.choices) == 2 + assert batch.choices[0].text == batch.choices[1].text - # test streaming - batch = await client.completions.create( - model=model_name, - prompt=["Hello, my name is", "Hello, my name is"], - max_tokens=5, - temperature=0.0, - stream=True, - ) - texts = [""] * 2 - async for chunk in batch: - assert len(chunk.choices) == 1 - choice = chunk.choices[0] - texts[choice.index] += choice.text - assert texts[0] == texts[1] + # test n = 2 + batch = await client.completions.create( + model=model_name, + prompt=prompts, + n=2, + max_tokens=5, + temperature=0.0, + extra_body=dict( + # NOTE: this has to be true for n > 1 in vLLM, but not necessary + # for official client. 
+ use_beam_search=True), + ) + assert len(batch.choices) == 4 + assert batch.choices[0].text != batch.choices[ + 1].text, "beam search should be different" + assert batch.choices[0].text == batch.choices[ + 2].text, "two copies of the same prompt should be the same" + assert batch.choices[1].text == batch.choices[ + 3].text, "two copies of the same prompt should be the same" + + # test streaming + batch = await client.completions.create( + model=model_name, + prompt=prompts, + max_tokens=5, + temperature=0.0, + stream=True, + ) + texts = [""] * 2 + async for chunk in batch: + assert len(chunk.choices) == 1 + choice = chunk.choices[0] + texts[choice.index] += choice.text + assert texts[0] == texts[1] @pytest.mark.asyncio -- GitLab From e691918e3bd75a05bc473c77577c494aa6442640 Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Sat, 15 Jun 2024 23:59:36 +0900 Subject: [PATCH 055/376] [misc] Do not allow to use lora with chunked prefill. (#5538) Co-authored-by: Cyrus Leung --- vllm/config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index d9e4a619e..54f36e1d6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1092,6 +1092,8 @@ class LoRAConfig: "Due to limitations of the custom LoRA CUDA kernel, " "max_num_batched_tokens must be <= 65528 when " "LoRA is enabled.") + if scheduler_config.chunked_prefill_enabled: + raise ValueError("LoRA is not supported with chunked prefill yet.") @dataclass -- GitLab From d919ecc771ece6995a949c3d4284c534a2bd0890 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Sat, 15 Jun 2024 13:38:16 -0400 Subject: [PATCH 056/376] add gptq_marlin test for bug report https://github.com/vllm-project/vllm/issues/5088 (#5145) --- tests/models/test_gptq_marlin.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index e30100d9b..4abbc41c9 100644 --- a/tests/models/test_gptq_marlin.py 
+++ b/tests/models/test_gptq_marlin.py @@ -40,6 +40,9 @@ MODELS = [ ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"), # 8-bit, act_order==True, group_size=32 ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"), + + # 4-bit, act_order==True, group_size=128 + ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main") ] -- GitLab From 1c0afa13c57766641e75172ff1cac2e09f79a3b9 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 15 Jun 2024 16:30:51 -0700 Subject: [PATCH 057/376] [BugFix] Don't start a Ray cluster when not using Ray (#5570) --- vllm/config.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 54f36e1d6..c0d294ce9 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -616,9 +616,14 @@ class ParallelConfig: "required for multi-node inference") backend = "ray" elif ray_found: - from ray.util import get_current_placement_group - if self.placement_group or get_current_placement_group(): + if self.placement_group: backend = "ray" + else: + from ray import is_initialized as ray_is_initialized + if ray_is_initialized(): + from ray.util import get_current_placement_group + if get_current_placement_group(): + backend = "ray" self.distributed_executor_backend = backend logger.info("Defaulting to use %s for distributed inference", backend) -- GitLab From 3ce2c050dd919542ef5355635edf71349ea597f2 Mon Sep 17 00:00:00 2001 From: zifeitong Date: Sat, 15 Jun 2024 16:57:54 -0700 Subject: [PATCH 058/376] [Fix] Correct OpenAI batch response format (#5554) --- vllm/entrypoints/openai/protocol.py | 13 ++++++++++++- vllm/entrypoints/openai/run_batch.py | 17 +++++++++++++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 3b56ad63f..b57d79859 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -672,6 +672,17 @@ class BatchRequestInput(OpenAIBaseModel): 
body: Union[ChatCompletionRequest, ] +class BatchResponseData(OpenAIBaseModel): + # HTTP status code of the response. + status_code: int = 200 + + # An unique identifier for the API request. + request_id: str + + # The body of the response. + body: Union[ChatCompletionResponse, ] + + class BatchRequestOutput(OpenAIBaseModel): """ The per-line object of the batch output and error files @@ -683,7 +694,7 @@ class BatchRequestOutput(OpenAIBaseModel): # inputs. custom_id: str - response: Optional[ChatCompletionResponse] + response: Optional[BatchResponseData] # For requests that failed with a non-HTTP error, this will contain more # information on the cause of the failure. diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 91e567924..b0c0f4ad2 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -10,7 +10,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import (BatchRequestInput, BatchRequestOutput, - ChatCompletionResponse) + BatchResponseData, + ChatCompletionResponse, + ErrorResponse) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext @@ -77,20 +79,27 @@ async def run_request(chat_serving: OpenAIServingChat, request: BatchRequestInput) -> BatchRequestOutput: chat_request = request.body chat_response = await chat_serving.create_chat_completion(chat_request) + if isinstance(chat_response, ChatCompletionResponse): batch_output = BatchRequestOutput( id=f"vllm-{random_uuid()}", custom_id=request.custom_id, - response=chat_response, + response=BatchResponseData( + body=chat_response, request_id=f"vllm-batch-{random_uuid()}"), error=None, ) - else: + elif isinstance(chat_response, ErrorResponse): batch_output = BatchRequestOutput( id=f"vllm-{random_uuid()}", 
custom_id=request.custom_id, - response=None, + response=BatchResponseData( + status_code=chat_response.code, + request_id=f"vllm-batch-{random_uuid()}"), error=chat_response, ) + else: + raise ValueError("Request must not be sent in stream mode") + return batch_output -- GitLab From f31c1f90e381967d25591a8928782d8a6a13693e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Sun, 16 Jun 2024 00:48:02 -0700 Subject: [PATCH 059/376] Add basic correctness 2 GPU tests to 4 GPU pipeline (#5518) --- .buildkite/test-pipeline.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6a2932db9..6439a315e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -32,7 +32,7 @@ steps: working_dir: "/vllm-workspace/tests" num_gpus: 2 -- label: Distributed Tests +- label: Distributed Tests (2 GPUs) mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -50,12 +50,16 @@ steps: - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py -- label: Distributed Tests (Multiple Groups) +- label: Distributed Tests (4 GPUs) #mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" num_gpus: 4 commands: - pytest -v -s distributed/test_pynccl.py + # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here. + # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. 
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - label: Engine Test mirror_hardwares: [amd] -- GitLab From 4a6769053ab2616f7f490e6ec5b8241e76ef0c2a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sun, 16 Jun 2024 10:07:34 -0400 Subject: [PATCH 060/376] [CI][BugFix] Flip is_quant_method_supported condition (#5577) --- tests/quantization/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py index 0c92d565d..29085916a 100644 --- a/tests/quantization/utils.py +++ b/tests/quantization/utils.py @@ -10,5 +10,5 @@ def is_quant_method_supported(quant_method: str) -> bool: capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] - return (capability < + return (capability >= QUANTIZATION_METHODS[quant_method].get_min_capability()) -- GitLab From f07d5133202c08899eb5f51134af0f43b7791a33 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 16 Jun 2024 16:07:01 -0700 Subject: [PATCH 061/376] [build][misc] limit numpy version (#5582) --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index bf9987e3a..32e2ebe8c 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -2,7 +2,7 @@ cmake >= 3.21 ninja # For faster builds. psutil sentencepiece # Required for LLaMA tokenizer. -numpy +numpy < 2.0.0 requests py-cpuinfo transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3. 
-- GitLab From 845a3f26f9706acafe8fa45ae452846d8cc3b97f Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 16 Jun 2024 19:08:01 -0700 Subject: [PATCH 062/376] [Doc] add debugging tips for crash and multi-node debugging (#5581) --- docs/source/getting_started/debugging.rst | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index ff37f4e62..a22bba147 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -24,6 +24,8 @@ If you have already taken care of the above issues, but the vLLM instance still With more logging, hopefully you can find the root cause of the issue. +If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error. + Here are some common issues that can cause hangs: - **Incorrect network setup**: The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. @@ -31,15 +33,26 @@ Here are some common issues that can cause hangs: .. code-block:: python - # save it as `test.py` , and run it with `NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py` - # adjust `--nproc-per-node` to the number of GPUs you want to use. 
import torch import torch.distributed as dist dist.init_process_group(backend="nccl") - data = torch.FloatTensor([1,] * 128).to(f"cuda:{dist.get_rank()}") + local_rank = dist.get_rank() % torch.cuda.device_count() + data = torch.FloatTensor([1,] * 128).to(f"cuda:{local_rank}") dist.all_reduce(data, op=dist.ReduceOp.SUM) torch.cuda.synchronize() value = data.mean().item() assert value == dist.get_world_size() +.. tip:: + + Save the script as ``test.py``. + + If you are testing on a single node, run it with ``NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py``, and adjust ``--nproc-per-node`` to the number of GPUs you want to use. + + If you are testing with multiple nodes, run it with ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py``. Adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup. Make sure ``MASTER_ADDR``: + + - is the correct IP address of the master node + - is reachable from all nodes + - is set before running the script. + If the problem persists, feel free to `open an issue on GitHub `_, with a detailed description of the issue, your environment, and the logs. 
-- GitLab From e2b85cf86a522e734a38b1d0314cfe9625003ef9 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Sun, 16 Jun 2024 23:48:06 -0700 Subject: [PATCH 063/376] Fix w8a8 benchmark and add Llama-3-8B (#5562) --- .../cutlass_benchmarks/w8a8_benchmarks.py | 21 ++++++++++++------- .../cutlass_benchmarks/weight_shapes.py | 6 ++++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 182105f0b..523e970c2 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -46,7 +46,7 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int, # impl -def pytorch_i8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, +def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, scale_b: torch.tensor, out_dtype: torch.dtype) -> torch.tensor: return torch.mm(a, b) @@ -115,7 +115,7 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, timers.append( bench_fn(a.to(dtype=torch.bfloat16, device="cuda"), b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b, - torch.bfloat16, label, sub_label, pytorch_i8_impl, + torch.bfloat16, label, sub_label, pytorch_mm_impl, "pytorch_bf16_bf16_bf16_matmul-no-scales")) # cutlass impl @@ -136,6 +136,13 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, timers = [] + # pytorch impl w. 
bf16 + timers.append( + bench_fn(a.to(dtype=torch.bfloat16, device="cuda"), + b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b, + torch.bfloat16, label, sub_label, pytorch_mm_impl, + "pytorch_bf16_bf16_bf16_matmul-no-scales")) + # pytorch impl: bf16 output, without fp8 fast accum timers.append( bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, @@ -160,14 +167,12 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, # cutlass impl: bf16 output timers.append( - bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), - torch.bfloat16, label, sub_label, cutlass_impl, - "cutlass_fp8_fp8_bf16_scaled_mm")) + bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, + cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm")) # cutlass impl: fp16 output timers.append( - bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), - torch.float16, label, sub_label, cutlass_impl, - "cutlass_fp8_fp8_fp16_scaled_mm")) + bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label, + cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm")) return timers diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py index 7ad4a53d3..25ec9d602 100644 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -22,6 +22,12 @@ WEIGHT_SHAPES = { ([4096, 22016], 1), ([11008, 4096], 0), ], + "meta-llama/Llama-3-8b": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], "meta-llama/Llama-2-13b-hf": [ ([5120, 15360], 1), ([5120, 5120], 0), -- GitLab From 9333fb8eb9ed6a62d33ef4d56d589f83a0f19233 Mon Sep 17 00:00:00 2001 From: Amit Garg Date: Mon, 17 Jun 2024 09:04:14 -0700 Subject: [PATCH 064/376] [Model] Rename Phi3 rope scaling type (#5595) --- vllm/config.py | 5 ++++- .../model_executor/layers/rotary_embedding.py | 19 ++++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff 
--git a/vllm/config.py b/vllm/config.py index c0d294ce9..552d5033f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1287,7 +1287,10 @@ def _get_and_verify_max_len( derived_max_model_len = default_max_len rope_scaling = getattr(hf_config, "rope_scaling", None) - if rope_scaling is not None and rope_scaling["type"] != "su": + # The correct one should be "longrope", kept "su" here + # to be backward compatible + if rope_scaling is not None and rope_scaling["type"] != "su" \ + and rope_scaling["type"] != "longrope": if disable_sliding_window: # TODO(robertgshaw): Find a model that supports rope_scaling # with sliding window to see if this case should be allowed. diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 792c47293..5a4940acb 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -467,7 +467,7 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding): return cache -class Phi3SuScaledRotaryEmbedding(nn.Module): +class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): """Phi3 family of models scaled rotary embedding. Based on the original RotaryEmbedding implementation. @@ -491,11 +491,12 @@ class Phi3SuScaledRotaryEmbedding(nn.Module): if rotary_dim != head_size: raise ValueError( - f"`Phi3SuScaledRotaryEmbedding` does not support rotary_dim != \ - head_size ({rotary_dim}!={head_size}).") + f"`Phi3LongRoPEScaledRotaryEmbedding` does not support \ + rotary_dim != head_size ({rotary_dim}!={head_size}).") if is_neox_style is False: raise ValueError( - "`Phi3SuScaledRotaryEmbedding` only supports neox_style.") + "`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style." 
+ ) self.head_size = head_size self.max_position_embeddings = max_position_embeddings @@ -608,7 +609,9 @@ def get_rope( is_neox_style, dtype) else: scaling_type = rope_scaling["type"] - if scaling_type != "su": + # The correct one should be "longrope" but keep "su" here + # for backward compatibility + if scaling_type != "su" and scaling_type != "longrope": scaling_factor = rope_scaling["factor"] if scaling_type == "linear": rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim, @@ -633,7 +636,9 @@ def get_rope( base, is_neox_style, scaling_factor, dtype, **extra_kwargs) - elif scaling_type == "su": + # The correct one should be "longrope" but keep "su" here + # for backward compatibility + elif scaling_type == "su" or scaling_type == "longrope": short_factor = rope_scaling["short_factor"] long_factor = rope_scaling["long_factor"] original_max_position = rope_scaling[ @@ -643,7 +648,7 @@ def get_rope( for k, v in rope_scaling.items() if k in ("short_mscale", "long_mscale") } - rotary_emb = Phi3SuScaledRotaryEmbedding( + rotary_emb = Phi3LongRoPEScaledRotaryEmbedding( head_size, rotary_dim, max_position, original_max_position, base, is_neox_style, dtype, short_factor, long_factor, **extra_kwargs) -- GitLab From 9e74d9d003d546c17dca472c3f4b48be651f1d7c Mon Sep 17 00:00:00 2001 From: Charles Riggins Date: Tue, 18 Jun 2024 00:05:33 +0800 Subject: [PATCH 065/376] Correct alignment in the seq_len diagram. 
(#5592) Co-authored-by: Liqian Chen --- vllm/attention/backends/flash_attn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 300bab728..1c48e2a0b 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -83,7 +83,7 @@ class FlashAttentionMetadata(AttentionMetadata): # |---------------- N iteration ---------------------| # |- tokenA -|......................|-- newTokens ---| # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| + # |-------------------- seq_len ---------------------| # |-- query_len ---| # Maximum query length in the batch. None for decoding. -- GitLab From 890d8d960bb441b4ac46588492db7f16b6da78d7 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Mon, 17 Jun 2024 12:32:48 -0400 Subject: [PATCH 066/376] [Kernel] `compressed-tensors` marlin 24 support (#5435) --- tests/quantization/test_compressed_tensors.py | 23 ++- .../compressed_tensors/compressed_tensors.py | 48 ++++--- .../compressed_tensors/schemes/__init__.py | 2 + .../schemes/compressed_tensors_w4a16_24.py | 134 ++++++++++++++++++ .../quantization/compressed_tensors/utils.py | 8 ++ 5 files changed, 196 insertions(+), 19 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 5670498f2..611c6b8b7 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -9,7 +9,8 @@ import torch from vllm import SamplingParams from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensorsLinearMethod, CompressedTensorsW4A16, - CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor) + CompressedTensorsW4A16Sparse24, 
CompressedTensorsW8A8DynamicToken, + CompressedTensorsW8A8StaticTensor) def test_compressed_tensors_w8a8_static_setup(vllm_runner): @@ -51,8 +52,7 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner): def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner): model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2" - with vllm_runner(model_path, enforce_eager=True, - dtype=torch.float16) as llm: + with vllm_runner(model_path, dtype=torch.float16) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] @@ -83,3 +83,20 @@ def test_compressed_tensors_w4a16(vllm_runner, w4a16_args): assert qkv_proj.weight_packed.dtype is torch.int32 assert qkv_proj.weight_scale.dtype is torch.float16 assert qkv_proj.weight_packed.pack_factor == 8 + + +def test_compressed_tensors_w4a16_marlin24(vllm_runner): + model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t" + with vllm_runner(model_path) as llm: + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24) + assert qkv_proj.weight_packed.dtype is torch.int32 + + sampling_params = SamplingParams() + output = llm.generate("Hello world!", sampling_params=sampling_params) + assert output diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index e134a26ef..92a84b3c0 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -8,16 +8,20 @@ from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 QuantizationConfig) from 
vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme, CompressedTensorsW4A16, - CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor) + CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken, + CompressedTensorsW8A8StaticTensor) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match) + CompressionFormat, QuantizationArgs, QuantizationStrategy, + find_first_name_or_class_match) class CompressedTensorsConfig(QuantizationConfig): - def __init__(self, layer_quant_details: Dict[str, Any], ignore: List[str]): + def __init__(self, layer_quant_details: Dict[str, Any], ignore: List[str], + quant_format: str): self.ignore = ignore self.layer_quant_details = layer_quant_details + self.quant_format = quant_format def get_linear_method(self) -> "CompressedTensorsLinearMethod": return CompressedTensorsLinearMethod(self) @@ -46,6 +50,7 @@ class CompressedTensorsConfig(QuantizationConfig): def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": layer_quant_details: Dict[str, Any] = dict() ignore: List[str] = config.get("ignore", None) + quant_format: str = config.get("format", None) # The quant_config has multiple config_groups, each containing # an input_activations key with details about how the activations are @@ -69,7 +74,9 @@ class CompressedTensorsConfig(QuantizationConfig): except Exception: layer_quant_details[target]["input_activations"] = None - return cls(layer_quant_details=layer_quant_details, ignore=ignore) + return cls(layer_quant_details=layer_quant_details, + ignore=ignore, + quant_format=quant_format) @classmethod def get_config_filenames(cls) -> List[str]: @@ -110,17 +117,26 @@ class CompressedTensorsConfig(QuantizationConfig): input_quant: BaseModel) -> "CompressedTensorsScheme": if self._is_w4a16(weight_quant, input_quant): - return 
CompressedTensorsW4A16(num_bits=weight_quant.num_bits, - strategy=weight_quant.strategy, - group_size=weight_quant.group_size) - - if self._is_static_tensor_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8StaticTensor() - - if self._is_dynamic_token_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8DynamicToken() - - raise NotImplementedError("Scheme not supported.") + if self.quant_format == CompressionFormat.marlin_24.value: + return CompressedTensorsW4A16Sparse24( + strategy=weight_quant.strategy, + num_bits=weight_quant.num_bits, + group_size=weight_quant.group_size) + if self.quant_format == CompressionFormat.pack_quantized.value: + return CompressedTensorsW4A16( + num_bits=weight_quant.num_bits, + strategy=weight_quant.strategy, + group_size=weight_quant.group_size) + + if self.quant_format == CompressionFormat.int_quantized.value: + if self._is_static_tensor_w8a8(weight_quant, input_quant): + return CompressedTensorsW8A8StaticTensor() + + if self._is_dynamic_token_w8a8(weight_quant, input_quant): + return CompressedTensorsW8A8DynamicToken() + + raise NotImplementedError( + "No compressed-tensors compatible scheme was found.") def get_scheme(self, layer: torch.nn.Module) -> "CompressedTensorsScheme": @@ -165,9 +181,9 @@ class CompressedTensorsLinearMethod(LinearMethodBase): scheme = self.quantization_config.get_scheme(layer=layer) scheme.create_weights( layer=layer, + input_size=input_size, input_size_per_partition=input_size_per_partition, output_partition_sizes=output_partition_sizes, - input_size=input_size, output_size=output_size, params_dtype=params_dtype, weight_loader=weight_loader) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index dc84d0008..3c95aa11f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ 
b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -2,6 +2,8 @@ from .compressed_tensors_scheme import CompressedTensorsScheme # noqa: F401 from .compressed_tensors_unquantized import ( # noqa: F401 CompressedTensorsUnquantized) from .compressed_tensors_w4a16 import CompressedTensorsW4A16 # noqa: F401 +from .compressed_tensors_w4a16_24 import ( # noqa: F401 + CompressedTensorsW4A16Sparse24) from .compressed_tensors_w8a8_dynamictoken import ( # noqa: F401, E501 CompressedTensorsW8A8DynamicToken) from .compressed_tensors_w8a8_statictensor import ( # noqa: F401, E501 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py new file mode 100644 index 000000000..d7e04ddb8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -0,0 +1,134 @@ +from typing import Callable, List, Optional + +import torch +from torch.nn import Parameter + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( + GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N) +from vllm.model_executor.utils import set_weight_attrs + +__all__ = ["CompressedTensorsW4A16Sparse24"] + + +class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme): + + def __init__(self, + strategy: str, + num_bits: int, + group_size: Optional[int] = None): + self.strategy = strategy + self.group_size = group_size + self.num_bits = num_bits + self.tile_size = 16 + + if self.strategy == "group" and self.group_size is None: + raise ValueError( + "group_size must be given when using strategy group") + + def create_weights(self, layer: torch.nn.Module, input_size: int, + output_partition_sizes: List[int], + 
input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + pack_factor = 32 // self.num_bits + output_size_per_partition = sum(output_partition_sizes) + + qweight = Parameter( + torch.empty( + input_size_per_partition // self.tile_size // 2, + output_size_per_partition * self.tile_size // pack_factor, + dtype=torch.int32, + ), + requires_grad=False, + ) + set_weight_attrs( + qweight, + { + "input_dim": 0, + "output_dim": 1, + "packed_dim": 1, + "pack_factor": pack_factor, + "marlin_tile_size": self.tile_size, + "weight_loader": weight_loader + }, + ) + + layer.register_parameter("weight_packed", qweight) + + input_groups = (1 if self.group_size is None else + input_size_per_partition // self.group_size) + + scales = Parameter( + torch.empty( + input_groups, + output_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + set_weight_attrs( + scales, + { + "output_dim": 1, + "input_dim": None if input_groups == 1 else 0, + "weight_loader": weight_loader + }, + ) + layer.register_parameter("scale_packed", scales) + + weight_shape = Parameter(torch.empty(2, dtype=torch.int64), + requires_grad=False) + + layer.register_parameter("weight_shape", weight_shape) + set_weight_attrs(weight_shape, {"weight_loader": weight_loader}) + + meta = Parameter( + torch.empty( + input_size_per_partition // 8 // 2 // 2, + output_size_per_partition * 2, + dtype=torch.int16, + ), + requires_grad=False, + ) + set_weight_attrs( + meta, + { + "input_dim": 0, + "packed_dim": 1, + "pack_factor": 1, + "output_dim": 1, + "marlin_tile_size": 2, + "weight_loader": weight_loader + }, + ) + layer.register_parameter("meta", meta) + + max_workspace_size = ( + output_size_per_partition // + GPTQ_MARLIN_24_MIN_THREAD_N) * GPTQ_MARLIN_24_MAX_PARALLEL + workspace = Parameter(torch.zeros(max_workspace_size, dtype=torch.int), + requires_grad=False) + layer.workspace = workspace + + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): 
+ qweight = layer.weight_packed + meta = layer.meta + scales = layer.scale_packed + workspace = layer.workspace + + x_2d = x.view(-1, x.shape[-1]) + + size_m = x_2d.shape[0] + size_k = x_2d.shape[1] + size_n = scales.shape[1] + + output_2d = ops.gptq_marlin_24_gemm(x_2d, qweight, meta, scales, + workspace, self.num_bits, size_m, + size_n, size_k) + + output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], )) + return output diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index fcc664910..b2bec9b60 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -6,6 +6,14 @@ from pydantic import BaseModel, Field from torch.nn import Module +class CompressionFormat(Enum): + dense = "dense" + sparse_bitmask = "sparse-bitmask" + int_quantized = "int-quantized" + pack_quantized = "pack-quantized" + marlin_24 = "marlin-24" + + class QuantizationType(str, Enum): """ Enum storing quantization type options -- GitLab From 1f12122b1714c855c02699775bcd2fb2b34f2577 Mon Sep 17 00:00:00 2001 From: zhyncs Date: Tue, 18 Jun 2024 00:40:35 +0800 Subject: [PATCH 067/376] [Misc] use AutoTokenizer for benchmark serving when vLLM not installed (#5588) --- benchmarks/backend_request_func.py | 29 ++++++++++++++++++++++++++++- benchmarks/benchmark_serving.py | 5 ++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 52386b8cd..4350b96b0 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -4,10 +4,13 @@ import sys import time import traceback from dataclasses import dataclass, field -from typing import List, Optional +from typing import List, Optional, Union import aiohttp +import huggingface_hub.constants from tqdm.asyncio import tqdm +from transformers import 
(AutoTokenizer, PreTrainedTokenizer, + PreTrainedTokenizerFast) AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) @@ -388,6 +391,30 @@ def remove_prefix(text: str, prefix: str) -> str: return text +def get_model(pretrained_model_name_or_path: str): + if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': + from modelscope import snapshot_download + else: + from huggingface_hub import snapshot_download + + model_path = snapshot_download( + model_id=pretrained_model_name_or_path, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) + return model_path + + +def get_tokenizer( + pretrained_model_name_or_path: str, trust_remote_code: bool +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + if pretrained_model_name_or_path is not None and not os.path.exists( + pretrained_model_name_or_path): + pretrained_model_name_or_path = get_model( + pretrained_model_name_or_path) + return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, + trust_remote_code=trust_remote_code) + + ASYNC_REQUEST_FUNCS = { "tgi": async_request_tgi, "vllm": async_request_openai_completions, diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index c136ee572..eef03e7d8 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -39,7 +39,10 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase -from vllm.transformers_utils.tokenizer import get_tokenizer +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer @dataclass -- GitLab From 728c4c8a063c25e7a20d6eda20a3f30873bda4c6 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 18 Jun 2024 02:01:25 +0800 Subject: [PATCH 068/376] [Hardware][Intel GPU] Add Intel GPU(XPU) inference backend (#3814) Co-authored-by: Jiang Li 
Co-authored-by: Abhilash Majumder Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> --- .buildkite/run-xpu-test.sh | 14 + .buildkite/test-template.j2 | 5 + Dockerfile.xpu | 22 + benchmarks/benchmark_latency.py | 2 +- benchmarks/benchmark_throughput.py | 2 +- .../getting_started/xpu-installation.rst | 61 +++ docs/source/index.rst | 1 + requirements-xpu.txt | 11 + setup.py | 8 + vllm/_custom_ops.py | 3 +- vllm/_ipex_ops.py | 241 ++++++++++ vllm/attention/backends/ipex_attn.py | 355 +++++++++++++++ vllm/attention/selector.py | 15 +- vllm/config.py | 4 +- vllm/distributed/parallel_state.py | 2 +- vllm/engine/arg_utils.py | 11 +- vllm/engine/async_llm_engine.py | 11 + vllm/engine/llm_engine.py | 8 + vllm/executor/ray_utils.py | 4 +- vllm/executor/ray_xpu_executor.py | 401 +++++++++++++++++ vllm/executor/xpu_executor.py | 98 ++++ vllm/model_executor/custom_op.py | 8 +- vllm/model_executor/layers/activation.py | 35 ++ vllm/model_executor/layers/layernorm.py | 24 + .../model_executor/layers/rotary_embedding.py | 23 + .../layers/vocab_parallel_embedding.py | 2 +- vllm/utils.py | 31 +- vllm/worker/cache_engine.py | 7 +- vllm/worker/worker.py | 3 +- vllm/worker/xpu_model_runner.py | 417 ++++++++++++++++++ vllm/worker/xpu_worker.py | 193 ++++++++ 31 files changed, 1998 insertions(+), 24 deletions(-) create mode 100644 .buildkite/run-xpu-test.sh create mode 100644 Dockerfile.xpu create mode 100644 docs/source/getting_started/xpu-installation.rst create mode 100644 requirements-xpu.txt create mode 100644 vllm/_ipex_ops.py create mode 100644 vllm/attention/backends/ipex_attn.py create mode 100644 vllm/executor/ray_xpu_executor.py create mode 100644 vllm/executor/xpu_executor.py create mode 100644 vllm/worker/xpu_model_runner.py create mode 100644 vllm/worker/xpu_worker.py diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh new file mode 100644 index 000000000..22a7e7693 --- /dev/null +++ b/.buildkite/run-xpu-test.sh @@ -0,0 +1,14 
@@ +# This script builds the XPU docker image and runs the offline inference inside the container. +# It serves as a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +docker build -t xpu-test -f Dockerfile.xpu . + +# Setup cleanup +remove_docker_container() { docker rm -f xpu-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and launch offline inference +docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 4a20a462b..3bd1e90c2 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -45,6 +45,11 @@ steps: queue: intel command: bash .buildkite/run-cpu-test.sh + - label: "XPU Test" + agents: + queue: intel + command: bash .buildkite/run-xpu-test.sh + {% for step in steps %} - label: "{{ step.label }}" agents: diff --git a/Dockerfile.xpu b/Dockerfile.xpu new file mode 100644 index 000000000..c39e55167 --- /dev/null +++ b/Dockerfile.xpu @@ -0,0 +1,22 @@ +FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 + +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \ + echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \ + chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \ + rm /etc/apt/sources.list.d/intel-graphics.list && \ + wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \ + echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee 
/etc/apt/sources.list.d/intel.gpu.jammy.list && \ + chmod 644 /usr/share/keyrings/intel-graphics.gpg + +RUN apt-get update -y \ +&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm + +RUN pip install -v -r requirements-xpu.txt + +RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install + +CMD ["/bin/bash"] diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 9937f8333..11d1bf7a4 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -191,7 +191,7 @@ if __name__ == '__main__': "--device", type=str, default="cuda", - choices=["cuda", "cpu", "tpu"], + choices=["cuda", "cpu", "tpu", "xpu"], help='device type for vLLM execution, supporting CUDA and CPU.') parser.add_argument('--block-size', type=int, diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 48dfce428..ed65002bc 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -349,7 +349,7 @@ if __name__ == "__main__": "--device", type=str, default="cuda", - choices=["cuda", "cpu", "tpu"], + choices=["cuda", "cpu", "tpu", "xpu"], help='device type for vLLM execution, supporting CUDA and CPU.') parser.add_argument( "--enable-prefix-caching", diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst new file mode 100644 index 000000000..4f0d2da25 --- /dev/null +++ b/docs/source/getting_started/xpu-installation.rst @@ -0,0 +1,61 @@ +.. _installation_xpu: + +Installation with XPU +======================== + +vLLM initially supports basic model inferencing and serving on Intel GPU platform. + +Table of contents: + +#. :ref:`Requirements <xpu_backend_requirements>` +#. :ref:`Quick start using Dockerfile <xpu_backend_quick_start_dockerfile>` +#. :ref:`Build from source <build_xpu_backend_from_source>` + +.. 
_xpu_backend_requirements: + +Requirements +------------ + +* OS: Linux +* Supported Hardware: Intel Data Center GPU (Intel ARC GPU WIP) +* OneAPI requirements: oneAPI 2024.1 + +.. _xpu_backend_quick_start_dockerfile: + +Quick start using Dockerfile +---------------------------- + +.. code-block:: console + + $ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . + $ docker run -it \ + --rm \ + --network=host \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + vllm-xpu-env + +.. _build_xpu_backend_from_source: + +Build from source +----------------- + +- First, install required driver and intel OneAPI 2024.1. + +- Second, install Python packages for vLLM XPU backend building: + +.. code-block:: console + + $ pip install --upgrade pip + $ pip install -v -r requirements-xpu.txt + +- Finally, build and install vLLM XPU backend: + +.. code-block:: console + + $ VLLM_TARGET_DEVICE=xpu python setup.py install + +.. note:: + - FP16 is the default data type in the current XPU backend. The BF16 data + type will be supported in the future. + diff --git a/docs/source/index.rst b/docs/source/index.rst index f5d862759..8795a865c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -66,6 +66,7 @@ Documentation getting_started/cpu-installation getting_started/neuron-installation getting_started/tpu-installation + getting_started/xpu-installation getting_started/quickstart getting_started/debugging getting_started/examples/examples_index diff --git a/requirements-xpu.txt b/requirements-xpu.txt new file mode 100644 index 000000000..48d899ec7 --- /dev/null +++ b/requirements-xpu.txt @@ -0,0 +1,11 @@ +# Common dependencies +-r requirements-common.txt + +setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed. 
+ +torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl +intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl +oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl + +triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + diff --git a/setup.py b/setup.py index 12a704e08..b2ae6def8 100644 --- a/setup.py +++ b/setup.py @@ -233,6 +233,10 @@ def _is_cpu() -> bool: return VLLM_TARGET_DEVICE == "cpu" +def _is_xpu() -> bool: + return VLLM_TARGET_DEVICE == "xpu" + + def _build_custom_ops() -> bool: return _is_cuda() or _is_hip() or _is_cpu() @@ -337,6 +341,8 @@ def get_vllm_version() -> str: version += "+tpu" elif _is_cpu(): version += "+cpu" + elif _is_xpu(): + version += "+xpu" else: raise RuntimeError("Unknown runtime environment") @@ -386,6 +392,8 @@ def get_requirements() -> List[str]: requirements = _read_requirements("requirements-tpu.txt") elif _is_cpu(): requirements = _read_requirements("requirements-cpu.txt") + elif _is_xpu(): + requirements = _read_requirements("requirements-xpu.txt") else: raise ValueError( "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.") diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 2f84b8bde..ab2a67950 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -373,7 +373,8 @@ def reshape_and_cache_flash( kv_cache_dtype) -def copy_blocks(key_caches: torch.Tensor, value_caches: torch.Tensor, +def copy_blocks(key_caches: List[torch.Tensor], + value_caches: List[torch.Tensor], block_mapping: torch.Tensor) -> None: torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) diff --git a/vllm/_ipex_ops.py 
b/vllm/_ipex_ops.py new file mode 100644 index 000000000..1e60e0848 --- /dev/null +++ b/vllm/_ipex_ops.py @@ -0,0 +1,241 @@ +from typing import List, Optional, Tuple + +import torch + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +try: + import intel_extension_for_pytorch as ipex +except ImportError as e: + logger.warning("Import error msg: %s", e.msg) + + +class ipex_ops: + + @staticmethod + def _reshape_activation_tensor( + x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + num = x.size(0) + d = x.size(1) // 2 + x = x.reshape(num, 2, d) + x1, x2 = torch.chunk(x, chunks=2, dim=1) + x1 = x1.reshape(num, d) + x2 = x2.reshape(num, d) + return x1, x2 + + def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + x1, x2 = ipex_ops._reshape_activation_tensor(x) + ipex.llm.functional.silu_mul(x1, x2, out) + + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + x1, x2 = ipex_ops._reshape_activation_tensor(x) + ipex.llm.functional.gelu_mul(x1, x2, out, "none") + + def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + x1, x2 = ipex_ops._reshape_activation_tensor(x) + ipex.llm.functional.gelu_mul(x1, x2, out, "tanh") + + def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + out.copy_(torch.nn.functional.gelu(x)) + + def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + out.copy_(torch.nn.functional.gelu(x)) + + def paged_attention_v1( + out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + block_size: int, + max_context_len: int, + alibi_slopes: Optional[torch.Tensor], + kv_cache_dtype: str, + kv_scale: float, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, + ) -> None: + assert kv_cache_dtype == "auto" + num_heads = out.size(1) + 
num_queries_per_tokens = num_heads // num_kv_heads + head_mapping = torch.arange( + 0, + num_kv_heads, + device=query.device, + dtype=torch.int32, + ).view(num_kv_heads, + 1).repeat_interleave(num_queries_per_tokens).flatten() + # todo: ipex will refactor namespace + torch.xpu.paged_attention_v1(out, query.contiguous(), + key_cache.view_as(value_cache), + value_cache, head_mapping, scale, + block_tables, context_lens, block_size, + max_context_len, alibi_slopes) + + def paged_attention_v2( + out: torch.Tensor, + exp_sum: torch.Tensor, + max_logits: torch.Tensor, + tmp_out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + block_size: int, + max_context_len: int, + alibi_slopes: Optional[torch.Tensor], + kv_cache_dtype: str, + kv_scale: float, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, + ) -> None: + assert kv_cache_dtype == "auto" + num_heads = out.size(1) + num_queries_per_tokens = num_heads // num_kv_heads + head_mapping = torch.arange( + 0, + num_kv_heads, + dtype=torch.int32, + device=query.device, + ).view(num_kv_heads, + 1).repeat_interleave(num_queries_per_tokens).flatten() + # todo: ipex will refactor namespace + torch.xpu.paged_attention_v2(out, exp_sum, max_logits, tmp_out, + query.contiguous(), + key_cache.view_as(value_cache), + value_cache, head_mapping, block_tables, + context_lens, scale, block_size, + max_context_len, alibi_slopes) + + def rotary_embedding( + positions: torch.Tensor, # [batch_size, seq_len] + query: torch.Tensor, # [batch_size, seq_len, num_heads*head_size] + key: torch.Tensor, # [batch_size, seq_len, num_kv_heads*head_size] + head_size: int, + cos_sin_cache: torch.Tensor, # [cos_sin_dim, rot_dim] + is_neox: bool, + ) -> None: + if positions.dim() == 1: + positions = 
positions.unsqueeze(0) + query = query.unsqueeze(0) + key = key.unsqueeze(0) + + rotary_dim = cos_sin_cache.size(1) + query = query.view(*query.shape[:-1], -1, head_size) + key = key.view(*key.shape[:-1], -1, head_size) + + query_rot = query[..., :rotary_dim] + key_rot = key[..., :rotary_dim] + + cos_sin = cos_sin_cache[positions.long()] + cos, sin = cos_sin.chunk(2, dim=-1) + + if is_neox: + cos = cos.repeat(1, 1, 2).unsqueeze(-2) + sin = sin.repeat(1, 1, 2).unsqueeze(-2) + else: + cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) + sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) + ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos, + rotary_dim, is_neox, positions) + + def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, + key: torch.Tensor, head_size: int, + cos_sin_cache: torch.Tensor, is_neox: bool, + rot_dim: int, + cos_sin_cache_offsets: torch.Tensor) -> None: + if positions.dim() == 1: + positions = positions.unsqueeze(0) + query = query.unsqueeze(0) + key = key.unsqueeze(0) + cos_sin_cache_offsets = cos_sin_cache_offsets.view_as(positions) + rotary_dim = cos_sin_cache.size(1) + query = query.view(*query.shape[:-1], -1, head_size) + key = key.view(*key.shape[:-1], -1, head_size) + + query_rot = query[..., :rotary_dim] + key_rot = key[..., :rotary_dim] + + cos_sin = cos_sin_cache[torch.add(positions, + cos_sin_cache_offsets).long()] + cos, sin = cos_sin.chunk(2, dim=-1) + + if is_neox: + cos = cos.repeat(1, 1, 2).unsqueeze(-2) + sin = sin.repeat(1, 1, 2).unsqueeze(-2) + else: + cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) + sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) + + ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos, + rotary_dim, is_neox, positions) + + def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, + epsilon: float) -> None: + tmp = ipex.llm.functional.rms_norm(input, weight, epsilon) + out.copy_(tmp) + + def fused_add_rms_norm(input: torch.Tensor, 
residual: torch.Tensor, + weight: torch.Tensor, epsilon: float) -> None: + tmp = ipex.llm.functional.add_rms_norm(residual, input, weight, None, + epsilon, True) + input.copy_(tmp) + + def varlen_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + out: torch.Tensor, + seqlen_q: torch.Tensor, + seqlen_k: torch.Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + pdropout: float, + softmax_scale: float, + zero_tensors: bool, + is_causal: bool, + return_softmax: bool, + gen_: torch.Generator, + ) -> None: + ipex.llm.functional.varlen_attention(query, key, value, out, seqlen_q, + seqlen_k, max_seqlen_q, + max_seqlen_k, pdropout, + softmax_scale, zero_tensors, + is_causal, return_softmax, gen_) + + def reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + kv_scale: float, + ) -> None: + assert kv_cache_dtype == "auto" + ipex.llm.modules.PagedAttention.reshape_and_cache( + key, value, key_cache, value_cache, slot_mapping) + + @staticmethod + def copy_blocks(key_caches: List[torch.Tensor], + value_caches: List[torch.Tensor], + block_mapping: torch.Tensor) -> None: + torch.xpu.copy_blocks(key_caches, value_caches, block_mapping) + + def swap_blocks(src: torch.Tensor, dst: torch.Tensor, + block_mapping: torch.Tensor) -> None: + torch.xpu.swap_blocks(src, dst, block_mapping) diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py new file mode 100644 index 000000000..f09b24f2a --- /dev/null +++ b/vllm/attention/backends/ipex_attn.py @@ -0,0 +1,355 @@ +""" Attention layer with torch scaled_dot_product_attention + and PagedAttention.""" +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type + +import torch + +from vllm._ipex_ops import ipex_ops +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata) +from 
vllm.attention.ops.paged_attn import (PagedAttention, + PagedAttentionMetadata) + +_PARTITION_SIZE = 512 + + +class IpexAttnBackend(AttentionBackend): + + @staticmethod + def get_name() -> str: + return "ipex-attn" + + @staticmethod + def get_impl_cls() -> Type["IpexAttnBackendImpl"]: + return IpexAttnBackendImpl + + @staticmethod + def make_metadata(*args, **kwargs) -> "IpexAttnMetadata": + return IpexAttnMetadata(*args, **kwargs) + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return PagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: torch.Tensor, + ) -> None: + PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: torch.Tensor, + ) -> None: + PagedAttention.copy_blocks(kv_caches, src_to_dists) + + +@dataclass +class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata): + """Metadata for IpexAttnBackend. + """ + # Currently, input sequences can only contain all prompts + # or all decoding. True if all sequences are prompts. + is_prompt: bool + slot_mapping: torch.Tensor + seq_lens: Optional[List[int]] + seqlen_q: Optional[torch.Tensor] + max_seqlen: Optional[int] + + def __post_init__(self): + # Set during the execution of the first attention op. + # It is a list because it is needed to set per prompt + # when alibi slopes is used. It is because of the limitation + # from xformer API. 
+ # will not appear in the __repr__ and __init__ + self.attn_bias: Optional[List[torch.Tensor]] = None + + @property + def prefill_metadata(self) -> Optional["IpexAttnMetadata"]: + # Currently chunked prefill is not supported + if self.num_decode_tokens == 0: + assert self.num_prefills > 0 + return self + + return None + + @property + def decode_metadata(self) -> Optional["IpexAttnMetadata"]: + # Currently chunked prefill is not supported + if self.num_prefills > 0: + assert self.num_decode_tokens == 0 + return None + + return self + + +class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + ) -> None: + assert blocksparse_params is None, ValueError( + "Torch SPDA does not support block-sparse attention.") + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + self.sliding_window = sliding_window + self.kv_cache_dtype = kv_cache_dtype + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.need_mask = (self.alibi_slopes is not None + or self.sliding_window is not None) + + supported_head_sizes = PagedAttention.get_supported_head_sizes() + if head_size not in supported_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {supported_head_sizes}.") + if kv_cache_dtype != "auto": + raise NotImplementedError( + "IPEX backend does not support FP8 KV cache. 
" + "Please use xFormers backend instead.") + + def split_kv_cache( + self, + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = 1 + num_blocks = kv_cache.shape[1] + + key_cache = kv_cache[0] + key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, + -1, x) + value_cache = kv_cache[1] + value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) + return key_cache, value_cache + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: Optional[torch.Tensor], + attn_metadata: IpexAttnMetadata, # type: ignore + kv_scale: float = 1.0, + ) -> torch.Tensor: + """Forward pass with IPEX varlen_attention and PagedAttention. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + assert kv_scale == 1.0 + num_tokens, hidden_size = query.shape + # Reshape the query, key, and value tensors. 
+ query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + + if kv_cache is not None: + key_cache, value_cache = self.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + ipex_ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping.flatten(), + self.kv_cache_dtype, + kv_scale, + ) + + if attn_metadata.is_prompt: + assert attn_metadata.seq_lens is not None + if (kv_cache is None or attn_metadata.block_tables.numel() == 0): + if self.num_kv_heads != self.num_heads: + key = key.repeat_interleave(self.num_queries_per_kv, dim=1) + value = value.repeat_interleave(self.num_queries_per_kv, + dim=1) + + if attn_metadata.attn_bias is None: + if self.alibi_slopes is not None: + att_masks = _make_alibi_bias( + self.alibi_slopes, query.dtype, + attn_metadata.seq_lens) # type: ignore + elif self.sliding_window is not None: + att_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, self.sliding_window, + query.dtype) # type: ignore + else: + att_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, None, dtype=query.dtype) + attn_metadata.attn_bias = att_masks + + output = torch.empty( + (num_tokens, self.num_heads, self.head_size), + dtype=query.dtype, + device=query.device) + ipex_ops.varlen_attention(query, + key, + value, + output, + attn_metadata.seqlen_q, + attn_metadata.seqlen_q, + attn_metadata.max_seqlen, + attn_metadata.max_seqlen, + pdropout=0.0, + softmax_scale=self.scale, + zero_tensors=False, + is_causal=True, + return_softmax=False, + gen_=None) + else: + # prefix-enabled attention + raise RuntimeError( + "IPEX backend doesn't support prefix decoding.") + + else: + # Decoding run. 
+ max_seq_len = attn_metadata.max_decode_seq_len + output = torch.empty_like(query) + block_size = value_cache.shape[3] + num_seqs, num_heads, head_size = query.shape + max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) // + _PARTITION_SIZE) + # NOTE(woosuk): We use a simple heuristic to decide whether to use + # PagedAttention V1 or V2. If the number of partitions is 1, we use + # V1 to avoid the overhead of reduction. Also, if the number of + # sequences or heads is large, we use V1 since there is enough work + # to parallelize. + # TODO(woosuk): Tune this heuristic. + # For context len > 8192, use V2 kernel to avoid shared memory + # shortage. + use_v1 = (max_seq_len <= 8192 and + (max_num_partitions == 1 or num_seqs * num_heads > 512)) + if use_v1: + # Run PagedAttention V1. + ipex_ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + self.num_kv_heads, + self.scale, + attn_metadata.block_tables, + attn_metadata.seq_lens_tensor, + block_size, + max_seq_len, + self.alibi_slopes, + self.kv_cache_dtype, + kv_scale, + ) + else: + # Run PagedAttention V2. + assert _PARTITION_SIZE % block_size == 0 + tmp_output = torch.empty( + size=(num_seqs, num_heads, max_num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, max_num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + ipex_ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + self.num_kv_heads, + self.scale, + attn_metadata.block_tables, + attn_metadata.seq_lens_tensor, + block_size, + max_seq_len, + self.alibi_slopes, + self.kv_cache_dtype, + kv_scale, + ) + + # Reshape the output tensor. 
+ return output.view(-1, self.num_heads * self.head_size) + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + dtype: torch.dtype, + seq_lens: List[int], +) -> List[torch.Tensor]: + attn_biases = [] + for seq_len in seq_lens: + bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(seq_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. + bias = bias[None, :] - bias[:, None] + + num_heads = alibi_slopes.shape[0] + bias = bias[None, :].repeat((num_heads, 1, 1)) + bias.mul_(alibi_slopes[:, None, None]) + inf_mask = torch.empty( + (1, seq_len, seq_len), + dtype=bias.dtype, + device=alibi_slopes.device).fill_(-torch.inf).triu_(diagonal=1) + attn_biases.append((bias + inf_mask).to(dtype)) + + return attn_biases + + +def _make_sliding_window_bias( + seq_lens: List[int], + window_size: Optional[int], + dtype: torch.dtype, +) -> List[torch.Tensor]: + attn_biases = [] + for seq_len in seq_lens: + tensor = torch.full( + (1, seq_len, seq_len), + dtype=dtype, + fill_value=1, + ) + shift = 0 + mask = torch.tril(tensor, diagonal=shift).to(dtype) # type: ignore + if window_size is not None: + mask = torch.triu(mask, diagonal=shift - window_size + 1) + mask = torch.log(mask) + attn_biases.append(mask.to(dtype)) + + return attn_biases diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 8b07fb2d7..1d56d87cc 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -7,7 +7,7 @@ import torch import vllm.envs as envs from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger -from vllm.utils import is_cpu, is_hip, is_tpu +from vllm.utils import is_cpu, is_hip, is_tpu, is_xpu logger = init_logger(__name__) @@ -19,6 +19,7 @@ class _Backend(enum.Enum): TORCH_SDPA = enum.auto() FLASHINFER = enum.auto() PALLAS = enum.auto() + IPEX = 
enum.auto() @lru_cache(maxsize=None) @@ -58,12 +59,17 @@ def get_attn_backend( ROCmFlashAttentionBackend) return ROCmFlashAttentionBackend elif backend == _Backend.TORCH_SDPA: - # TODO: make XPU backend available here. assert is_cpu(), RuntimeError( "Torch SDPA backend is only used for the CPU device.") logger.info("Using Torch SDPA backend.") from vllm.attention.backends.torch_sdpa import TorchSDPABackend return TorchSDPABackend + elif backend == _Backend.IPEX: + assert is_xpu(), RuntimeError( + "IPEX attention backend is only used for the XPU device.") + logger.info("Using IPEX attention backend.") + from vllm.attention.backends.ipex_attn import IpexAttnBackend + return IpexAttnBackend elif backend == _Backend.FLASHINFER: logger.info("Using Flashinfer backend.") logger.warning("Eager mode is required for the Flashinfer backend. " @@ -107,6 +113,11 @@ def which_attn_to_use( logger.info("Cannot use %s backend on CPU.", selected_backend) return _Backend.TORCH_SDPA + if is_xpu(): + if selected_backend != _Backend.IPEX: + logger.info("Cannot use %s backend on XPU.", selected_backend) + return _Backend.IPEX + if is_tpu(): if selected_backend != _Backend.PALLAS: logger.info("Cannot use %s backend on TPU.", selected_backend) diff --git a/vllm/config.py b/vllm/config.py index 552d5033f..b1a3a82f5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -12,7 +12,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.models import ModelRegistry from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu, - is_hip, is_neuron, is_tpu) + is_hip, is_neuron, is_tpu, is_xpu) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -757,6 +757,8 @@ class DeviceConfig: self.device_type = "tpu" elif is_cpu(): self.device_type = "cpu" + elif is_xpu(): + self.device_type = "xpu" else: # We don't call torch.cuda.is_available() here to # 
avoid initializing CUDA before workers are forked diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 16c5297af..02b0dcbcb 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -58,7 +58,7 @@ def _split_tensor_dict( # because it contains not only the device type but also the device # index (e.g. "cuda:0"). We only need the device type. # receiving side will set the device index. - device = "cpu" if value.is_cpu else "cuda" + device = value.device.type metadata_list.append( (key, TensorMetadata(device, value.dtype, value.size()))) tensor_list.append(value) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ba53b5c86..9d04f1dc5 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -501,11 +501,12 @@ class EngineArgs: 'Enabling this will use the fully sharded layers. ' 'At high sequence length, max rank or ' 'tensor parallel size, this is likely faster.')) - parser.add_argument("--device", - type=str, - default=EngineArgs.device, - choices=["auto", "cuda", "neuron", "cpu", "tpu"], - help='Device type for vLLM execution.') + parser.add_argument( + "--device", + type=str, + default=EngineArgs.device, + choices=["auto", "cuda", "neuron", "cpu", "tpu", "xpu"], + help='Device type for vLLM execution.') # Related to Vision-language models such as llava parser = EngineArgs.add_cli_args_for_vlm(parser) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 03b6d03a9..ab312850b 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -383,6 +383,17 @@ class AsyncLLMEngine: "Distributed execution is not supported with the CPU backend.") from vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync + elif engine_config.device_config.device_type == "xpu": + if distributed_executor_backend is None: + from vllm.executor.xpu_executor import XPUExecutorAsync + executor_class = 
XPUExecutorAsync + elif distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_xpu_executor import RayXPUExecutorAsync + executor_class = RayXPUExecutorAsync + else: + raise RuntimeError( + "Not supported distributed execution model on XPU device.") elif distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index fd64337d4..eed9a17e4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -347,6 +347,14 @@ class LLMEngine: elif engine_config.device_config.device_type == "cpu": from vllm.executor.cpu_executor import CPUExecutor executor_class = CPUExecutor + elif engine_config.device_config.device_type == "xpu": + if distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_xpu_executor import RayXPUExecutor + executor_class = RayXPUExecutor + else: + from vllm.executor.xpu_executor import XPUExecutor + executor_class = XPUExecutor elif distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutor diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 4704f5f1b..495fddd17 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -3,7 +3,7 @@ from typing import List, Optional, Tuple from vllm.config import ParallelConfig from vllm.logger import init_logger -from vllm.utils import get_ip, is_hip +from vllm.utils import get_ip, is_hip, is_xpu from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -71,7 +71,7 @@ def initialize_ray_cluster( "serving.") # Connect to a ray cluster. 
- if is_hip(): + if is_hip() or is_xpu(): ray.init(address=ray_address, ignore_reinit_error=True, num_gpus=parallel_config.world_size) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py new file mode 100644 index 000000000..dd7c82289 --- /dev/null +++ b/vllm/executor/ray_xpu_executor.py @@ -0,0 +1,401 @@ +import asyncio +import os +import pickle +from collections import defaultdict +from itertools import islice, repeat +from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set, + Tuple, Union) + +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + SpeculativeConfig, VisionLanguageConfig) +from vllm.executor.distributed_gpu_executor import ( # yapf: disable + DistributedGPUExecutor, DistributedGPUExecutorAsync) +from vllm.executor.ray_utils import RayWorkerWrapper, ray +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + make_async) + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + +# If the env var is set, it uses the Ray's compiled DAG API +# which optimizes the control plane overhead. +# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. 
+USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) + + +class RayXPUExecutor(DistributedGPUExecutor): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + load_config: LoadConfig, + lora_config: Optional[LoRAConfig], + vision_language_config: Optional[VisionLanguageConfig], + speculative_config: Optional[SpeculativeConfig], + ) -> None: + assert device_config.device_type == "xpu" + assert (not speculative_config + ), "Speculative decoding not yet supported for XPU backend" + + self.model_config = model_config + self.cache_config = cache_config + self.load_config = load_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.vision_language_config = vision_language_config + + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel GPU workers. + self._init_workers_ray(placement_group) + + # Profile the memory usage and initialize the cache. + self.forward_dag = None + if USE_RAY_COMPILED_DAG: + self.forward_dag = self._compiled_ray_dag() + + # This is non-None when the execute model loop is running + # in the parallel workers. It's a coroutine in the AsyncLLMEngine case. + self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + # Updated by implementations that require additional args to be passed + # to the _run_workers execute_model call + self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {} + + def _init_executor(self) -> None: + pass + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of available KV blocks. 
+ + This invokes `determine_num_available_blocks` on each worker and takes + the min of the results, guaranteeing that the selected cache sizes are + compatible with all workers. + + Returns: + - Tuple[num_gpu_blocks, num_cpu_blocks] + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_blocks = self._run_workers("determine_num_available_blocks", ) + + # Since we use a shared centralized controller, we take the minimum + # number of blocks across all workers to make sure all the memory + # operators can be applied to all workers. + num_gpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + + return num_gpu_blocks, num_cpu_blocks + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + if self.parallel_config.tensor_parallel_size == 1: + # For single GPU case, we use a ray worker with constrained memory. + num_gpus = self.cache_config.gpu_memory_utilization + else: + # Otherwise, the ray workers are allocated with a full GPU. + num_gpus = 1 + + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: Optional[RayWorkerWrapper] = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerWrapper] = [] + + # Create the workers. 
+ driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("GPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + worker = ray.remote( + num_cpus=0, + num_gpus=num_gpus, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote( + worker_module_name="vllm.worker.xpu_worker", + worker_class_name="XPUWorker", + trust_remote_code=self.model_config.trust_remote_code, + ) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + self.driver_worker = RayWorkerWrapper( + worker_module_name="vllm.worker.xpu_worker", + worker_class_name="XPUWorker", + trust_remote_code=self.model_config.trust_remote_code, + ) + else: + # Else, added to the list of workers. + self.workers.append(worker) + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any GPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "GPU node.") + + # Get the set of GPU IDs used on each node. 
+ worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", + use_dummy_driver=True) + + node_workers = defaultdict(list) + node_gpus = defaultdict(list) + + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): + node_workers[node_id].append(i) + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + + # TODO: add env var for xpu + + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + def collect_arg_helper_func(**kwargs): + # avoid writing `{"name": value}` manually + return kwargs + + init_worker_all_kwargs = [] + + # Initialize the actual workers inside worker wrapper. + for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids, ): + local_rank = node_workers[node_id].index(rank) + init_worker_all_kwargs.append( + collect_arg_helper_func( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + load_config=self.load_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + is_driver_worker=rank == 0, + )) + self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) + + self._run_workers("init_device") + self._run_workers( + "load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers, + ) + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + """Initialize the KV cache in all workers. + """ + + # NOTE: We log here to avoid multiple logs when number of workers is + # greater than one. We could log in the engine, but not all executors + # have GPUs. 
+ logger.info("# GPU blocks: %d, " + "# CPU blocks: %d", num_gpu_blocks, num_cpu_blocks) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + self._run_workers("initialize_cache", + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) + + def _driver_execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + """Run execute_model in the driver worker. + + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + return self.driver_worker.execute_method("execute_model", + execute_model_req) + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "add_lora", + lora_request=lora_request, + ) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "remove_lora", + lora_id=lora_id, + ) + + def list_loras(self) -> Set[int]: + return self._run_workers("list_loras") + + def _run_workers( + self, + method: str, + *args, + async_run_remote_workers_only: bool = False, + all_args: Optional[List[Tuple[Any, ...]]] = None, + all_kwargs: Optional[List[Dict[str, Any]]] = None, + use_dummy_driver: bool = False, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers. 
Can be used in the following + ways: + + - args/kwargs: All workers share the same args/kwargs + - args/kwargs and driver_args/driver_kwargs: Driver worker has + different args + - all_args/all_kwargs: args/kwargs for each worker are specified + individually + """ + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + count = len(self.workers) + all_worker_args = repeat(args, count) if all_args is None \ + else islice(all_args, 1, None) + all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ + else islice(all_kwargs, 1, None) + + if use_ray_compiled_dag: + # Right now, compiled DAG can only accept a single + # input. TODO(sang): Fix it. + assert self.forward_dag is not None + output_channels = self.forward_dag.execute(1) + else: + # Start the ray workers first. + ray_worker_outputs = [ + worker.execute_method.remote(method, *worker_args, + **worker_kwargs) + for (worker, worker_args, worker_kwargs + ) in zip(self.workers, all_worker_args, all_worker_kwargs) + ] + if async_run_remote_workers_only: + # Just return futures + return ray_worker_outputs + + driver_args = args if all_args is None else all_args[0] + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + + # Start the driver worker after all the ray workers. + if not use_dummy_driver: + driver_worker_output = self.driver_worker.execute_method( + method, *driver_args, **driver_kwargs) + else: + assert self.driver_dummy_worker is not None + driver_worker_output = ray.get( + self.driver_dummy_worker.execute_method.remote( + method, *driver_args, **driver_kwargs)) + # Get the results of the ray workers. + if self.workers: + if use_ray_compiled_dag: + try: + ray_worker_outputs = [ + pickle.loads(chan.begin_read()) + for chan in output_channels + ] + finally: + # Has to call end_read in order to reuse the DAG. 
+ for chan in output_channels: + chan.end_read() + else: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) + + def _compiled_ray_dag(self): + import pkg_resources + required_version = "2.9" + current_version = pkg_resources.get_distribution("ray").version + if current_version < required_version: + raise ValueError(f"Ray version {required_version} or greater is " + f"required, but found {current_version}") + + from ray.dag import InputNode, MultiOutputNode + assert self.parallel_config.worker_use_ray + + # Right now, compiled DAG requires at least 1 arg. We send + # a dummy value for now. It will be fixed soon. + with InputNode() as input_data: + forward_dag = MultiOutputNode([ + worker.execute_model_compiled_dag_remote. + bind( # type: ignore[attr-defined] + input_data) for worker in self.workers + ]) + return forward_dag.experimental_compile() + + def check_health(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + + def _check_if_any_actor_is_dead(self): + if not self.workers: + return + + dead_actors = [] + for actor in self.workers: + actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access + if actor_state["State"] == "DEAD": + dead_actors.append(actor) + if dead_actors: + raise RuntimeError("At least one Worker is dead. " + f"Dead Workers: {dead_actors}. 
") + + +class RayXPUExecutorAsync(RayXPUExecutor, DistributedGPUExecutorAsync): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.driver_exec_method = make_async(self.driver_worker.execute_method) + + async def _driver_execute_model_async( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + return await self.driver_exec_method("execute_model", + execute_model_req) + + async def _start_worker_execution_loop(self): + coros = [ + worker.execute_method.remote("start_worker_execution_loop") + for worker in self.workers + ] + return await asyncio.gather(*coros) diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py new file mode 100644 index 000000000..d37200bd0 --- /dev/null +++ b/vllm/executor/xpu_executor.py @@ -0,0 +1,98 @@ +from typing import List, Optional + +import torch + +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + SpeculativeConfig, VisionLanguageConfig) +from vllm.executor.executor_base import ExecutorAsyncBase +from vllm.executor.gpu_executor import GPUExecutor +from vllm.logger import init_logger +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.utils import make_async +from vllm.worker.worker_base import WorkerWrapperBase + +logger = init_logger(__name__) + + +class XPUExecutor(GPUExecutor): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + load_config: LoadConfig, + lora_config: Optional[LoRAConfig], + vision_language_config: Optional[VisionLanguageConfig], + speculative_config: Optional[SpeculativeConfig], + ) -> None: + assert device_config.device_type == "xpu" + assert (not speculative_config + ), "Speculative decoding not yet supported for XPU backend" + + model_config = _verify_and_get_model_config(model_config) + + 
self.model_config = model_config + self.cache_config = cache_config + self.load_config = load_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.vision_language_config = vision_language_config + self.speculative_config = None + + # Instantiate the worker and load the model to GPU. + self._init_executor() + + def _create_worker(self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None): + if self.speculative_config is None: + worker_module_name = "vllm.worker.xpu_worker" + worker_class_name = "XPUWorker" + else: + raise NotImplementedError( + "XPU does not support speculative decoding") + + wrapper = WorkerWrapperBase( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + ) + wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, + distributed_init_method)) + return wrapper.worker + + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + output = self.driver_worker.execute_model(execute_model_req) + return output + + +class XPUExecutorAsync(XPUExecutor, ExecutorAsyncBase): + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest, + ) -> List[SamplerOutput]: + output = await make_async(self.driver_worker.execute_model + )(execute_model_req=execute_model_req) + return output + + +def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: + if config.dtype == torch.bfloat16: + logger.warning( + "bfloat16 is not fully supported on XPU, casting to float16.") + config.dtype = torch.float16 + if not config.enforce_eager: + logger.warning( + "CUDA graph is not supported on XPU, fallback to the eager " + "mode.") + config.enforce_eager = True + return config diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 56aa629ae..0db72d8d9 100644 --- a/vllm/model_executor/custom_op.py 
+++ b/vllm/model_executor/custom_op.py @@ -1,6 +1,6 @@ import torch.nn as nn -from vllm.utils import is_cpu, is_hip, is_tpu +from vllm.utils import is_cpu, is_hip, is_tpu, is_xpu class CustomOp(nn.Module): @@ -29,9 +29,7 @@ class CustomOp(nn.Module): return self.forward_cuda(*args, **kwargs) def forward_xpu(self, *args, **kwargs): - # By default, we assume that XPU ops are compatible with CUDA ops. - # NOTE(woosuk): This is a placeholder for future extensions. - return self.forward_cuda(*args, **kwargs) + raise NotImplementedError def forward_cpu(self, *args, **kwargs): # By default, we assume that CPU ops are compatible with CUDA ops. @@ -58,5 +56,7 @@ class CustomOp(nn.Module): return self.forward_cpu elif is_tpu(): return self.forward_tpu + elif is_xpu(): + return self.forward_xpu else: return self.forward_cuda diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 4d076421f..eb0606948 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -37,6 +37,15 @@ class SiluAndMul(CustomOp): ops.silu_and_mul(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + class GeluAndMul(CustomOp): """An activation function for GeGLU. 
@@ -71,6 +80,18 @@ class GeluAndMul(CustomOp): ops.gelu_tanh_and_mul(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + if self.approximate == "none": + ops.gelu_and_mul(out, x) + elif self.approximate == "tanh": + ops.gelu_tanh_and_mul(out, x) + return out + def extra_repr(self) -> str: return f'approximate={repr(self.approximate)}' @@ -90,6 +111,13 @@ class NewGELU(CustomOp): ops.gelu_new(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + class FastGELU(CustomOp): @@ -105,6 +133,13 @@ class FastGELU(CustomOp): ops.gelu_fast(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + class ScaledActivation(nn.Module): """An activation function with post-scale parameters. 
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 4533adf8f..14f5e2378 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -67,6 +67,30 @@ class RMSNorm(CustomOp): ) return out + def forward_xpu( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + from vllm._ipex_ops import ipex_ops as ops + + if residual is not None: + ops.fused_add_rms_norm( + x, + residual, + self.weight.data, + self.variance_epsilon, + ) + return x, residual + out = torch.empty_like(x) + ops.rms_norm( + out, + x, + self.weight.data, + self.variance_epsilon, + ) + return out + def extra_repr(self) -> str: s = f"hidden_size={self.weight.data.size(0)}" s += f", eps={self.variance_epsilon}" diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 5a4940acb..9c0a74cda 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -221,6 +221,29 @@ class RotaryEmbedding(CustomOp): self.cos_sin_cache, self.is_neox_style) return query, key + def forward_xpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + from vllm._ipex_ops import ipex_ops as ops + + self.cos_sin_cache = self.cos_sin_cache.to(positions.device, + dtype=query.dtype) + # ops.rotary_embedding()/batched_rotary_embedding() + # are in-place operations that update the query and key tensors. 
+ if offsets is not None: + ops.batched_rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, + self.is_neox_style, self.rotary_dim, + offsets) + else: + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) + return query, key + def forward_tpu( self, positions: torch.Tensor, diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 60eb5b404..1a26c5c63 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -307,7 +307,7 @@ class VocabParallelEmbedding(torch.nn.Module): else: masked_input = input_ # Get the embeddings. - output_parallel = F.embedding(masked_input, self.weight) + output_parallel = F.embedding(masked_input.long(), self.weight) # Mask the output embedding. if self.tp_size > 1: output_parallel.masked_fill_(input_mask.unsqueeze(1), 0) diff --git a/vllm/utils.py b/vllm/utils.py index 9b39ca77a..1adfa9218 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -160,6 +160,26 @@ def is_tpu() -> bool: return libtpu is not None +@lru_cache(maxsize=None) +def is_xpu() -> bool: + from importlib.metadata import version + is_xpu_flag = "xpu" in version("vllm") + # vllm is not build with xpu + if not is_xpu_flag: + return False + try: + import intel_extension_for_pytorch as ipex # noqa: F401 + _import_ipex = True + except ImportError as e: + logger.warning("Import Error for IPEX: %s", e.msg) + _import_ipex = False + # ipex dependency is not ready + if not _import_ipex: + logger.warning("not found ipex lib") + return False + return hasattr(torch, "xpu") and torch.xpu.is_available() + + @lru_cache(maxsize=None) def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" @@ -482,6 +502,9 @@ def is_pin_memory_available() -> bool: print_warning_once("Using 'pin_memory=False' as WSL is detected. 
" "This may slow down the performance.") return False + elif is_xpu(): + print_warning_once("Pin memory is not supported on XPU.") + return False elif is_neuron(): print_warning_once("Pin memory is not supported on Neuron.") return False @@ -497,8 +520,12 @@ class CudaMemoryProfiler: def current_memory_usage(self) -> float: # Return the memory usage in bytes. - torch.cuda.reset_peak_memory_stats(self.device) - mem = torch.cuda.max_memory_allocated(self.device) + if torch.cuda.is_available(): + torch.cuda.reset_peak_memory_stats(self.device) + mem = torch.cuda.max_memory_allocated(self.device) + elif is_xpu(): + torch.xpu.reset_peak_memory_stats(self.device) + mem = torch.xpu.max_memory_allocated(self.device) return mem def __enter__(self): diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 341b177d4..fbd1343fe 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -4,7 +4,7 @@ from typing import List import torch from vllm.attention import get_attn_backend -from vllm.config import CacheConfig, ModelConfig, ParallelConfig +from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_pin_memory_available) @@ -25,10 +25,12 @@ class CacheEngine: cache_config: CacheConfig, model_config: ModelConfig, parallel_config: ParallelConfig, + device_config: DeviceConfig, ) -> None: self.cache_config = cache_config self.model_config = model_config self.parallel_config = parallel_config + self.device_config = device_config self.head_size = model_config.get_head_size() self.num_layers = model_config.get_num_layers(parallel_config) @@ -55,7 +57,8 @@ class CacheEngine: ) # Initialize the cache. 
- self.gpu_cache = self._allocate_kv_cache(self.num_gpu_blocks, "cuda") + self.gpu_cache = self._allocate_kv_cache( + self.num_gpu_blocks, self.device_config.device_type) self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu") def _allocate_kv_cache( diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 7a378a862..f9b8a065a 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -205,7 +205,8 @@ class Worker(WorkerBase): def _init_cache_engine(self): assert self.cache_config.num_gpu_blocks is not None self.cache_engine = CacheEngine(self.cache_config, self.model_config, - self.parallel_config) + self.parallel_config, + self.device_config) self.gpu_cache = self.cache_engine.gpu_cache def _warm_up_model(self) -> None: diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py new file mode 100644 index 000000000..f30de703e --- /dev/null +++ b/vllm/worker/xpu_model_runner.py @@ -0,0 +1,417 @@ +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn + +from vllm.attention import get_attn_backend +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + VisionLanguageConfig) +from vllm.distributed import broadcast_tensor_dict +from vllm.logger import init_logger +from vllm.model_executor.model_loader import get_model +from vllm.sampling_params import SamplingParams +from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata +from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad +from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata + +logger = init_logger(__name__) + +_PAD_SLOT_ID = -1 +_BATCH_SIZE_ALIGNMENT = 8 +_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ + _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33) +] + + +class XPUModelRunner: + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: 
DeviceConfig, + cache_config: CacheConfig, + load_config: LoadConfig, + lora_config: Optional[LoRAConfig], + vision_language_config: Optional[VisionLanguageConfig], + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + *args, + **kwargs, + ): + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.lora_config = lora_config + self.load_config = load_config + self.cache_config = cache_config + self.vision_language_config = vision_language_config + self.is_driver_worker = is_driver_worker + + self.sliding_window = model_config.get_sliding_window() + self.device_config = device_config + self.device = self.device_config.device + + self.kv_cache_dtype = kv_cache_dtype + self.block_size = cache_config.block_size + self.max_context_len_to_capture = ( + self.model_config.max_context_len_to_capture + if self.model_config is not None else 0) + + self.attn_backend = get_attn_backend( + self.model_config.get_num_attention_heads(self.parallel_config), + self.model_config.get_head_size(), + self.model_config.get_num_kv_heads(self.parallel_config), + self.model_config.get_sliding_window(), + self.model_config.dtype, + self.kv_cache_dtype, + self.block_size, + ) + + # Lazy initialization. 
+ self.model: nn.Module # Set after init_Model + + def load_model(self) -> None: + with CudaMemoryProfiler() as m: + self.model = get_model( + model_config=self.model_config, + device_config=self.device_config, + load_config=self.load_config, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + cache_config=self.cache_config, + ) + + self.model_memory_usage = m.consumed_memory + logger.info("Loading model weights took %.4f GB", + self.model_memory_usage / float(2**30)) + + @property + def vocab_size(self) -> int: + return self.model_config.get_vocab_size() + + @torch.inference_mode() + def profile_run(self) -> None: + # Enable top-k sampling to reflect the accurate memory usage. + sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) + max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + max_num_seqs = self.scheduler_config.max_num_seqs + + # Profile memory usage with max_num_sequences sequences and the total + # number of tokens equal to max_num_batched_tokens. + seqs: List[SequenceGroupMetadata] = [] + # Additional GPU memory may be needed for vision encoding, which needs + # to be accounted for when calculating the GPU blocks for + # vLLM blocker manager. + # To exercise the worst scenario for GPU memory consumption, + # the number of seqs (batch_size) is chosen to maximize the number + # of images processed. 
+ for group_id in range(max_num_seqs): + seq_len = (max_num_batched_tokens // max_num_seqs + + (group_id < max_num_batched_tokens % max_num_seqs)) + + seq_data = SequenceData([0] * seq_len) + dummy_multi_modal_data = None + seq = SequenceGroupMetadata( + request_id=str(group_id), + is_prompt=True, + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=None, + lora_request=None, + multi_modal_data=dummy_multi_modal_data, + ) + seqs.append(seq) + + # Run the model with the dummy inputs. + num_layers = self.model_config.get_num_layers(self.parallel_config) + kv_caches = [None] * num_layers + self.execute_model(seqs, kv_caches) + torch.xpu.synchronize() + return + + def prepare_input_tensors( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, + Optional[torch.Tensor]]: + multi_modal_input = None + if self.is_driver_worker: + # NOTE: We assume that all sequences in the group are all prompts or + # all decodes. + is_prompt = seq_group_metadata_list[0].is_prompt + # Prepare input tensors. + if is_prompt: + (input_tokens, input_positions, attn_metadata, seq_lens, + multi_modal_input + ) = self._prepare_prompt(seq_group_metadata_list) + else: + (input_tokens, input_positions, + attn_metadata) = self._prepare_decode(seq_group_metadata_list) + seq_lens = [] + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + # subquery_lens is not needed if chunked prefill is not + # supported. Since CPU worker doesn't support chunked prefill + # just use seq_lens instead. + seq_lens, + self.device, + pin_memory=False) + # Broadcast the metadata. 
+ metadata_dict = { + "input_tokens": input_tokens, + "input_positions": input_positions, + "selected_token_indices": + sampling_metadata.selected_token_indices, + } + metadata_dict.update(attn_metadata.asdict_zerocopy()) + broadcast_tensor_dict(metadata_dict, src=0) + else: + metadata_dict = broadcast_tensor_dict(src=0) + input_tokens = metadata_dict.pop("input_tokens") + input_positions = metadata_dict.pop("input_positions") + selected_token_indices = metadata_dict.pop( + "selected_token_indices") + attn_metadata = self.attn_backend.make_metadata(**metadata_dict) + sampling_metadata = SamplingMetadata( + seq_groups=None, + selected_token_indices=selected_token_indices, + categorized_sample_indices=None, + num_prompts=0, + ) + + return (input_tokens, input_positions, attn_metadata, + sampling_metadata, multi_modal_input) + + def _prepare_decode( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[int] = [] + input_positions: List[int] = [] + slot_mapping: List[int] = [] + seq_lens: List[int] = [] + block_tables: List[List[int]] = [] + + for seq_group_metadata in seq_group_metadata_list: + assert not seq_group_metadata.is_prompt + assert seq_group_metadata.token_chunk_size == 1 + + seq_ids = list(seq_group_metadata.seq_data.keys()) + + for seq_id in seq_ids: + seq_data = seq_group_metadata.seq_data[seq_id] + generation_token = seq_data.get_last_token_id() + input_tokens.append(generation_token) + + seq_len = seq_data.get_len() + position = seq_len - 1 + input_positions.append(position) + + seq_len = seq_len if self.sliding_window is None else min( + seq_len, self.sliding_window) + seq_lens.append(seq_len) + + block_table = seq_group_metadata.block_tables[seq_id] + block_number = block_table[position // self.block_size] + block_offset = position % self.block_size + slot = block_number * self.block_size + block_offset + 
slot_mapping.append(slot) + + if self.sliding_window is not None: + sliding_window_blocks = (self.sliding_window // + self.block_size) + block_table = block_table[-sliding_window_blocks:] + block_tables.append(block_table) + + max_decode_seq_len = max(seq_lens) + + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) + input_positions = torch.tensor(input_positions, + dtype=torch.long, + device=self.device) + slot_mapping = torch.tensor(slot_mapping, + dtype=torch.long, + device=self.device) + seq_lens_tensor = torch.tensor(seq_lens, + dtype=torch.int, + device=self.device) + + max_block_table_len = max( + len(block_table) for block_table in block_tables) + block_tables = make_tensor_with_pad( + block_tables, + max_len=max_block_table_len, + pad=0, + dtype=torch.int, + device=self.device, + ) + + attn_metadata = self.attn_backend.make_metadata( + is_prompt=False, + slot_mapping=slot_mapping, + seq_lens=seq_lens, + seqlen_q=None, + max_seqlen=None, + seq_lens_tensor=seq_lens_tensor, + max_decode_seq_len=max_decode_seq_len, + num_prefill_tokens=0, + num_decode_tokens=len(input_tokens), + num_prefills=0, + block_tables=block_tables, + ) + return ( + input_tokens, + input_positions, + attn_metadata, + ) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + kv_caches: List[torch.Tensor], + ) -> Optional[SamplerOutput]: + (input_tokens, input_positions, attn_metadata, sampling_metadata, + multi_modal_input + ) = self.prepare_input_tensors(seq_group_metadata_list) + + model_executable = self.model + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": attn_metadata, + } + if self.vision_language_config: + execute_model_kwargs.update({"image_input": multi_modal_input}) + + hidden_states = model_executable(**execute_model_kwargs) + + # Compute the logits. 
+ logits = self.model.compute_logits(hidden_states, sampling_metadata) + + # Only perform sampling in the driver worker. + if not self.is_driver_worker: + return None + + # Sample the next token. + output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + return output + + def _prepare_prompt( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], + Optional[torch.Tensor]]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[int] = [] + input_positions: List[int] = [] + slot_mapping: List[int] = [] + seq_lens: List[int] = [] + multi_modal_input_list: List[torch.Tensor] = [] + + for seq_group_metadata in seq_group_metadata_list: + assert seq_group_metadata.is_prompt + seq_ids = list(seq_group_metadata.seq_data.keys()) + assert len(seq_ids) == 1 + seq_id = seq_ids[0] + + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_tokens = seq_data.get_token_ids() + computed_len = seq_data.get_num_computed_tokens() + seq_len = len(prompt_tokens) + + seq_lens.append(seq_len) # Prompt token num + input_tokens.extend(prompt_tokens) # Token ids + + # Token position ids + # NOTE(woosuk): Here we assume that the first token in the prompt + # is always the first token in the sequence. + input_positions.extend(list(range(computed_len, seq_len))) + + if seq_group_metadata.multi_modal_data: + multi_modal_input_list.append( + seq_group_metadata.multi_modal_data.data) + + if seq_group_metadata.block_tables is None: + # During memory profiling, the block tables are not initialized + # yet. In this case, we just use a dummy slot mapping. + slot_mapping.extend([_PAD_SLOT_ID] * seq_len) + continue + + # Compute the slot mapping. + block_table = seq_group_metadata.block_tables[seq_id] + # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, + # where start_idx is max(0, seq_len - sliding_window). 
+ # For example, if the prompt len is 10, sliding window is 8, and + # block size is 4, the first two tokens are masked and the slot + # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. + start_idx = 0 + if self.sliding_window is not None: + start_idx = max(0, seq_len - self.sliding_window) + + for i in range(computed_len, seq_len): + if i < start_idx: + slot_mapping.append(_PAD_SLOT_ID) + continue + + block_number = block_table[i // + self.block_size] # type: ignore + block_offset = i % self.block_size # type: ignore + slot = block_number * self.block_size + block_offset + slot_mapping.append(slot) + + if multi_modal_input_list: + assert self.vision_language_config, ( + "Multi-modal inputs are only supported by " + "vision language models.") + multi_modal_input = torch.cat(multi_modal_input_list, + dim=0).to(self.device) + else: + multi_modal_input = None + + num_prompt_tokens = len(input_tokens) + + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) # type: ignore + input_positions = torch.tensor(input_positions, + dtype=torch.long, + device=self.device) # type: ignore + slot_mapping = torch.tensor(slot_mapping, + dtype=torch.long, + device=self.device) # type: ignore + + max_seqlen = max(seq_lens) + tmp = [0] + tmp.extend(seq_lens) + seqlen = torch.tensor(tmp) + seqlen_q = torch.cumsum(seqlen, dim=0).to(device=self.device) + + attn_metadata = self.attn_backend.make_metadata( + is_prompt=True, + slot_mapping=slot_mapping, + seq_lens=seq_lens, + seqlen_q=seqlen_q, + max_seqlen=max_seqlen, + seq_lens_tensor=None, + max_decode_seq_len=None, + num_prefills=len(seq_lens), + num_prefill_tokens=num_prompt_tokens, + num_decode_tokens=0, + block_tables=torch.tensor([], device=self.device, dtype=torch.int), + ) + return (input_tokens, input_positions, attn_metadata, seq_lens, + multi_modal_input) diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py new file mode 100644 index 000000000..773ee9f81 --- /dev/null +++ 
b/vllm/worker/xpu_worker.py @@ -0,0 +1,193 @@ +"""A XPU worker class.""" +import gc +import os +from typing import List, Optional, Tuple + +import intel_extension_for_pytorch # noqa: F401 +import oneccl_bindings_for_pytorch # noqa: F401 +import torch +import torch.distributed + +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + SpeculativeConfig, VisionLanguageConfig) +from vllm.distributed import (ensure_model_parallel_initialized, + init_distributed_environment) +from vllm.logger import init_logger +from vllm.model_executor import set_random_seed +from vllm.utils import is_xpu +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.worker import Worker +from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.worker.xpu_model_runner import XPUModelRunner + +logger = init_logger(__name__) + + +class XPUWorker(LoraNotSupportedWorkerBase, Worker): + """A worker class that executes (a partition of) the model on a GPU. + + Each worker is associated with a single XPU device. The worker is + responsible for maintaining the KV cache and executing the model on the + XPU. In case of distributed inference, each worker is assigned a partition + of the model. 
+ """ + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + cache_config: CacheConfig, + load_config: LoadConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + lora_config: Optional[LoRAConfig] = None, + vision_language_config: Optional[VisionLanguageConfig] = None, + speculative_config: Optional[SpeculativeConfig] = None, + is_driver_worker: bool = False, + ) -> None: + assert device_config.device_type == "xpu" + assert is_xpu() + + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.cache_config = cache_config + self.load_config = load_config + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.lora_config = lora_config + self.is_driver_worker = is_driver_worker + if self.is_driver_worker: + assert self.rank == 0, "The driver worker must have rank 0." + + self.vision_language_config = vision_language_config + if self.vision_language_config: + assert not self.lora_config, ( + "To be tested: vision language model with LoRA settings.") + + self.model_runner = XPUModelRunner( # type: ignore + model_config, + parallel_config, + scheduler_config, + device_config, + cache_config, + load_config=self.load_config, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=is_driver_worker, + vision_language_config=vision_language_config, + ) + # Uninitialized cache engine. Will be initialized by + # initialize_cache. 
+ self.cache_engine: CacheEngine + self.gpu_cache: List[torch.Tensor] + + def init_device(self) -> None: + if self.device_config.device.type == "xpu" and is_xpu(): + self.device = torch.device(f"xpu:{self.local_rank}") + torch.xpu.set_device(self.device) + torch.xpu.empty_cache() + self.init_gpu_memory = torch.xpu.get_device_properties( + self.local_rank).total_memory + else: + raise RuntimeError( + f"Not support device type: {self.device_config.device}") + # Initialize the distributed environment. + self.init_worker_distributed_environment() + # Initialize the model. + set_random_seed(self.model_config.seed) + + # keep this method for `empty_cache` and `synchronize` api + @torch.inference_mode() + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Profiles the peak memory usage of the model to determine how many + KV blocks may be allocated without OOMs. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Profile the memory usage of the model and get the maximum number of + # cache blocks that can be allocated with the remaining free memory. + torch.xpu.empty_cache() + + # Execute a forward pass with dummy inputs to profile the memory usage + # of the model. + self.model_runner.profile_run() + + # Calculate the number of blocks that can be allocated with the + # profiled peak memory. + torch.xpu.synchronize() + used_memory = torch.xpu.memory_allocated() + total_gpu_memory = torch.xpu.get_device_properties( + self.local_rank).total_memory + free_gpu_memory = total_gpu_memory - used_memory + + # NOTE(woosuk): Here we assume that the other processes using the same + # GPU did not change their memory usage during the profiling. 
+ peak_memory = self.init_gpu_memory - free_gpu_memory + assert peak_memory > 0, ( + "Error in memory profiling. This happens when the GPU memory was " + "not properly cleaned up before initializing the vLLM instance.") + + cache_block_size = self.get_cache_block_size_bytes() + num_gpu_blocks = int( + (total_gpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) // cache_block_size) + num_cpu_blocks = int(self.cache_config.swap_space_bytes // + cache_block_size) + num_gpu_blocks = max(num_gpu_blocks, 0) + num_cpu_blocks = max(num_cpu_blocks, 0) + gc.collect() + torch.xpu.empty_cache() + return num_gpu_blocks, num_cpu_blocks + + def _warm_up_model(self) -> None: + # IPEX don't support capture graph yet + pass + + def init_worker_distributed_environment(self) -> None: + """Initialize the distributed environment.""" + + parallel_config = self.parallel_config + rank = self.rank + distributed_init_method = self.distributed_init_method + + if torch.distributed.is_initialized(): + torch_world_size = torch.distributed.get_world_size() + if torch_world_size != parallel_config.world_size: + raise RuntimeError( + "torch.distributed is already initialized but the torch " + "world size does not match parallel_config.world_size " + f"({torch_world_size} vs. {parallel_config.world_size}).") + elif not distributed_init_method: + raise ValueError( + "distributed_init_method must be set if torch.distributed " + "is not already initialized") + else: + # use sockets as default Level zero IPC exchange backend. By + # default oneccl will use `drmfd` as mechanism which need extra + # dependency (libdrm and drm headers) on your system. 
+ ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", + "sockets") + os.environ['CCL_ZE_IPC_EXCHANGE'] = ENV_CCL_ZE_IPC_EXCHANGE + init_distributed_environment( + world_size=parallel_config.world_size, + rank=rank, + distributed_init_method=distributed_init_method, + local_rank=self.local_rank, + backend="ccl") + + ensure_model_parallel_initialized( + parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) -- GitLab From ab66536dbfedff4ffcbb6dc9f9a21d0a9ac0ec91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Tue, 18 Jun 2024 02:36:10 +0800 Subject: [PATCH 069/376] [CI/BUILD] Support non-AVX512 vLLM building and testing (#5574) --- .buildkite/run-cpu-test.sh | 5 ++++- Dockerfile.cpu | 4 ++++ cmake/cpu_extension.cmake | 13 ++++++++++++- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 5f9ca5d75..532d6ad88 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -4,17 +4,20 @@ set -ex # Try building the docker image docker build -t cpu-test -f Dockerfile.cpu . +docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . 
# Setup cleanup -remove_docker_container() { docker rm -f cpu-test || true; } +remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } trap remove_docker_container EXIT remove_docker_container # Run the image docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test +docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2 # offline inference docker exec cpu-test bash -c "python3 examples/offline_inference.py" +docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c "cd tests; diff --git a/Dockerfile.cpu b/Dockerfile.cpu index 777bb0829..6e55203de 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -21,6 +21,10 @@ WORKDIR /workspace/vllm RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... 
+ARG VLLM_CPU_DISABLE_AVX512 +ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} + RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install WORKDIR /workspace/ diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index a644e5b6a..511e443f7 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -33,10 +33,21 @@ function (find_isa CPUINFO TARGET OUT) endif() endfunction() +function (is_avx512_disabled OUT) + set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512}) + if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true") + set(${OUT} ON PARENT_SCOPE) + else() + set(${OUT} OFF PARENT_SCOPE) + endif() +endfunction() + +is_avx512_disabled(AVX512_DISABLED) + find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) -if (AVX512_FOUND) +if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS "-mavx512f" "-mavx512vl" -- GitLab From 9e4e6fe2073ff5e4a747d5ce2a08d321268b7254 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 17 Jun 2024 11:41:08 -0700 Subject: [PATCH 070/376] [CI] the readability of benchmarking and prepare for dashboard (#5571) [CI] Improve the readability of performance benchmarking results and prepare for upcoming performance dashboard (#5571) --- .buildkite/nightly-benchmarks/README.md | 21 +- .../run-benchmarks-suite.sh | 6 +- .../convert-results-json-to-markdown.py | 260 ++++++++++-------- .../nightly-benchmarks/tests/descriptions.md | 67 +++++ .../{ => tests}/latency-tests.json | 2 +- .../{ => tests}/serving-tests.json | 2 +- .../{ => tests}/throughput-tests.json | 2 +- benchmarks/benchmark_latency.py | 2 +- 8 files changed, 232 insertions(+), 130 deletions(-) create mode 100644 .buildkite/nightly-benchmarks/tests/descriptions.md rename .buildkite/nightly-benchmarks/{ => tests}/latency-tests.json (99%) rename .buildkite/nightly-benchmarks/{ => tests}/serving-tests.json (99%) rename .buildkite/nightly-benchmarks/{ => tests}/throughput-tests.json (99%) diff --git 
a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index 6a18be947..4036b32a4 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -13,9 +13,17 @@ This benchmark will be *triggered* upon: **Benchmarking Duration**: about 1hr. -## Configuring the workload for the quick benchmark +**For benchmarking developers**: please try your best to constrain the duration of benchmarking to less than 1.5 hr so that it won't take forever to run. -The workload of the quick benchmark contains two parts: latency tests in `latency-tests.json`, throughput tests in `throughput-tests.json` and serving tests in `serving-tests.json`. + +## Configuring the workload + +The benchmarking workload contains three parts: +- Latency tests in `latency-tests.json`. +- Throughput tests in `throughput-tests.json`. +- Serving tests in `serving-tests.json`. + +See [descriptions.md](tests/descriptions.md) for detailed descriptions. ### Latency test @@ -23,7 +31,6 @@ Here is an example of one test inside `latency-tests.json`: ```json [ - ... { "test_name": "latency_llama8B_tp1", "parameters": { @@ -34,7 +41,6 @@ Here is an example of one test inside `latency-tests.json`: "num_iters": 15 } }, - ... ] ``` @@ -57,7 +63,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t ``` [ - ... { "test_name": "serving_llama8B_tp1_sharegpt", "qps_list": [1, 4, 16, "inf"], @@ -77,7 +82,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t "num_prompts": 200 } }, - ... ] ``` @@ -92,7 +96,8 @@ The number of this test is less stable compared to the delay and latency benchma WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. 
## Visualizing the results -The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table. +The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results. You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. If you do not see the table, please wait till the benchmark finish running. -The JSON file is also attached within each buildkite job for further analysis. \ No newline at end of file +The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file. +The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking job. diff --git a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh index 6cff6917f..021473f76 100644 --- a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh +++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh @@ -343,9 +343,9 @@ main() { QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ # benchmarking - run_serving_tests $QUICK_BENCHMARK_ROOT/serving-tests.json - run_latency_tests $QUICK_BENCHMARK_ROOT/latency-tests.json - run_throughput_tests $QUICK_BENCHMARK_ROOT/throughput-tests.json + run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json + run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json + run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json # postprocess benchmarking results diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 75cff8434..9aa8162d1 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ 
b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -1,4 +1,5 @@ import json +import os from pathlib import Path import pandas as pd @@ -11,12 +12,13 @@ latency_results = [] latency_column_mapping = { "test_name": "Test name", "gpu_type": "GPU", - "avg_latency": "Average latency (s)", - "P10": "P10 (s)", - "P25": "P25 (s)", - "P50": "P50 (s)", - "P75": "P75 (s)", - "P90": "P90 (s)", + "avg_latency": "Mean latency (ms)", + # "P10": "P10 (s)", + # "P25": "P25 (s)", + "P50": "Median", + # "P75": "P75 (s)", + # "P90": "P90 (s)", + "P99": "P99", } # thoughput tests and the keys that will be printed into markdown @@ -24,11 +26,11 @@ throughput_results = [] throughput_results_column_mapping = { "test_name": "Test name", "gpu_type": "GPU", - "num_requests": "# of req.", - "total_num_tokens": "Total # of tokens", - "elapsed_time": "Elapsed time (s)", + # "num_requests": "# of req.", + # "total_num_tokens": "Total # of tokens", + # "elapsed_time": "Elapsed time (s)", "requests_per_second": "Tput (req/s)", - "tokens_per_second": "Tput (tok/s)", + # "tokens_per_second": "Tput (tok/s)", } # serving results and the keys that will be printed into markdown @@ -36,120 +38,148 @@ serving_results = [] serving_column_mapping = { "test_name": "Test name", "gpu_type": "GPU", - "completed": "# of req.", + # "completed": "# of req.", "request_throughput": "Tput (req/s)", - "input_throughput": "Input Tput (tok/s)", - "output_throughput": "Output Tput (tok/s)", + # "input_throughput": "Input Tput (tok/s)", + # "output_throughput": "Output Tput (tok/s)", "mean_ttft_ms": "Mean TTFT (ms)", # do not say TTFT again to avoid the table getting too wide "median_ttft_ms": "Median", "p99_ttft_ms": "P99", - "mean_tpot_ms": "Mean TPOT (ms)", - "median_tpot_ms": "Median", - "p99_tpot_ms": "P99", + # "mean_tpot_ms": "Mean TPOT (ms)", + # "median_tpot_ms": "Median", + # "p99_tpot_ms": "P99", "mean_itl_ms": "Mean ITL (ms)", "median_itl_ms": "Median", "p99_itl_ms": "P99", } -for 
test_file in results_folder.glob("*.json"): - - with open(test_file, "r") as f: - raw_result = json.loads(f.read()) - - if "serving" in str(test_file): - # this result is generated via `benchmark_serving.py` - - # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: - command = json.loads(f.read()) - raw_result.update(command) - - # update the test name of this result - raw_result.update({"test_name": test_file.stem}) - - # add the result to raw_result - serving_results.append(raw_result) - continue - - elif "latency" in f.name: - # this result is generated via `benchmark_latency.py` - - # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: - command = json.loads(f.read()) - raw_result.update(command) - - # update the test name of this result - raw_result.update({"test_name": test_file.stem}) - - # get different percentiles - for perc in [10, 25, 50, 75, 90]: - raw_result.update( - {f"P{perc}": raw_result["percentiles"][str(perc)]}) - - # add the result to raw_result - latency_results.append(raw_result) - continue - - elif "throughput" in f.name: - # this result is generated via `benchmark_throughput.py` - - # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: - command = json.loads(f.read()) - raw_result.update(command) - - # update the test name of this result - raw_result.update({"test_name": test_file.stem}) - - # add the result to raw_result - throughput_results.append(raw_result) - continue - - print(f"Skipping {test_file}") - -latency_results = pd.DataFrame.from_dict(latency_results) -serving_results = pd.DataFrame.from_dict(serving_results) -throughput_results = pd.DataFrame.from_dict(throughput_results) - -# remapping the key, for visualization purpose -if not latency_results.empty: - latency_results = latency_results[list( - latency_column_mapping.keys())].rename(columns=latency_column_mapping) -if 
not serving_results.empty: - serving_results = serving_results[list( - serving_column_mapping.keys())].rename(columns=serving_column_mapping) -if not throughput_results.empty: - throughput_results = throughput_results[list( - throughput_results_column_mapping.keys())].rename( - columns=throughput_results_column_mapping) - -# get markdown tables -latency_md_table = tabulate(latency_results, - headers='keys', - tablefmt='pipe', - showindex=False) -serving_md_table = tabulate(serving_results, - headers='keys', - tablefmt='pipe', - showindex=False) -throughput_md_table = tabulate(throughput_results, - headers='keys', - tablefmt='pipe', - showindex=False) - -# document the result -with open(results_folder / "benchmark_results.md", "w") as f: + +def read_markdown(file): + if os.path.exists(file): + with open(file, "r") as f: + return f.read() + "\n" + else: + return f"{file} not found.\n" + + +def results_to_json(latency, throughput, serving): + return json.dumps({ + 'latency': latency.to_dict(), + 'throughput': throughput.to_dict(), + 'serving': serving.to_dict() + }) + + +if __name__ == "__main__": + + # collect results + for test_file in results_folder.glob("*.json"): + + with open(test_file, "r") as f: + raw_result = json.loads(f.read()) + + if "serving" in str(test_file): + # this result is generated via `benchmark_serving.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + serving_results.append(raw_result) + continue + + elif "latency" in f.name: + # this result is generated via `benchmark_latency.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + 
raw_result.update({"test_name": test_file.stem}) + + # get different percentiles + for perc in [10, 25, 50, 75, 90, 99]: + # Multiply 1000 to convert the time unit from s to ms + raw_result.update( + {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}) + raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 + + # add the result to raw_result + latency_results.append(raw_result) + continue + + elif "throughput" in f.name: + # this result is generated via `benchmark_throughput.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + throughput_results.append(raw_result) + continue + + print(f"Skipping {test_file}") + + latency_results = pd.DataFrame.from_dict(latency_results) + serving_results = pd.DataFrame.from_dict(serving_results) + throughput_results = pd.DataFrame.from_dict(throughput_results) + + raw_results_json = results_to_json(latency_results, throughput_results, + serving_results) + + # remapping the key, for visualization purpose if not latency_results.empty: - f.write("## Latency tests\n") - f.write(latency_md_table) - f.write("\n") - if not throughput_results.empty: - f.write("## Throughput tests\n") - f.write(throughput_md_table) - f.write("\n") + latency_results = latency_results[list( + latency_column_mapping.keys())].rename( + columns=latency_column_mapping) if not serving_results.empty: - f.write("## Serving tests\n") - f.write(serving_md_table) - f.write("\n") + serving_results = serving_results[list( + serving_column_mapping.keys())].rename( + columns=serving_column_mapping) + if not throughput_results.empty: + throughput_results = throughput_results[list( + throughput_results_column_mapping.keys())].rename( + columns=throughput_results_column_mapping) + + processed_results_json = 
results_to_json(latency_results, + throughput_results, + serving_results) + + # get markdown tables + latency_md_table = tabulate(latency_results, + headers='keys', + tablefmt='pipe', + showindex=False) + serving_md_table = tabulate(serving_results, + headers='keys', + tablefmt='pipe', + showindex=False) + throughput_md_table = tabulate(throughput_results, + headers='keys', + tablefmt='pipe', + showindex=False) + + # document the result + with open(results_folder / "benchmark_results.md", "w") as f: + + results = read_markdown( + "../.buildkite/nightly-benchmarks/tests/descriptions.md") + results = results.format( + latency_tests_markdown_table=latency_md_table, + throughput_tests_markdown_table=throughput_md_table, + serving_tests_markdown_table=serving_md_table, + benchmarking_results_in_json_string=processed_results_json) + f.write(results) diff --git a/.buildkite/nightly-benchmarks/tests/descriptions.md b/.buildkite/nightly-benchmarks/tests/descriptions.md new file mode 100644 index 000000000..891e49170 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/descriptions.md @@ -0,0 +1,67 @@ + +## Latency tests + +This test suite aims to test vllm's end-to-end latency under a controlled setup. + +- Input length: 32 tokens. +- Output length: 128 tokens. +- Batch size: fixed (8). +- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Evaluation metrics: end-to-end latency (mean, median, p99). + +### Latency benchmarking results + +{latency_tests_markdown_table} + +## Throughput tests + +This test suite aims to test vllm's throughput. + +- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). +- Output length: the corresponding output length of these 200 prompts. +- Batch size: dynamically determined by vllm to achieve maximum throughput. +- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Evaluation metrics: throughput. 
+ +### Throughput benchmarking results + +{throughput_tests_markdown_table} + +## Serving tests + +This test suite aims to test vllm's real serving metrics. + +- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). +- Output length: the corresponding output length of these 200 prompts. +- Batch size: dynamically determined by vllm and the arrival pattern of the requests. +- **Average QPS (queries per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed). +- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). + +### Serving benchmarking results + +{serving_tests_markdown_table} + +## json version of the benchmarking tables + +This section contains the data of the markdown tables above in JSON format. +You can load the benchmarking tables into pandas dataframes as follows: + +```python +import json +import pandas as pd + +benchmarking_results_json = """The json string""" +benchmarking_results = json.loads(benchmarking_results_json) +latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"]) +throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"]) +serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"]) +``` + +The json string for all benchmarking tables: +```json +{benchmarking_results_in_json_string} +``` + +You can also check the raw experiment data in the Artifacts tab of the Buildkite page. 
+ diff --git a/.buildkite/nightly-benchmarks/latency-tests.json b/.buildkite/nightly-benchmarks/tests/latency-tests.json similarity index 99% rename from .buildkite/nightly-benchmarks/latency-tests.json rename to .buildkite/nightly-benchmarks/tests/latency-tests.json index 294a8c439..06488cd79 100644 --- a/.buildkite/nightly-benchmarks/latency-tests.json +++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json @@ -29,4 +29,4 @@ "num-iters": 15 } } -] +] \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json similarity index 99% rename from .buildkite/nightly-benchmarks/serving-tests.json rename to .buildkite/nightly-benchmarks/tests/serving-tests.json index bb6746612..86a0fefa3 100644 --- a/.buildkite/nightly-benchmarks/serving-tests.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json @@ -56,4 +56,4 @@ "num_prompts": 200 } } -] +] \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/throughput-tests.json b/.buildkite/nightly-benchmarks/tests/throughput-tests.json similarity index 99% rename from .buildkite/nightly-benchmarks/throughput-tests.json rename to .buildkite/nightly-benchmarks/tests/throughput-tests.json index db4f908d7..41ac13574 100644 --- a/.buildkite/nightly-benchmarks/throughput-tests.json +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json @@ -32,4 +32,4 @@ "backend": "vllm" } } -] +] \ No newline at end of file diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 11d1bf7a4..767afd21a 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -98,7 +98,7 @@ def main(args: argparse.Namespace): for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): latencies.append(run_to_completion(profile_dir=None)) latencies = np.array(latencies) - percentages = [10, 25, 50, 75, 90] + percentages = [10, 25, 50, 75, 90, 99] percentiles = 
np.percentile(latencies, percentages) print(f'Avg latency: {np.mean(latencies)} seconds') for percentage, percentile in zip(percentages, percentiles): -- GitLab From 1b44aaf4e3559e4e321f32715b08f1aa7e4f3d50 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 17 Jun 2024 14:35:04 -0700 Subject: [PATCH 071/376] [bugfix][distributed] fix 16 gpus local rank arrangement (#5604) --- vllm/executor/ray_gpu_executor.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 89d1c4ac7..843332e5e 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -137,6 +137,12 @@ class RayGPUExecutor(DistributedGPUExecutor): for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): node_workers[node_id].append(i) + # `gpu_ids` can be a list of strings or integers. + # convert them to integers for consistency. + # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), + # string sorting is not sufficient. 
+ # see https://github.com/vllm-project/vllm/issues/5590 + gpu_ids = [int(x) for x in gpu_ids] node_gpus[node_id].extend(gpu_ids) for node_id, gpu_ids in node_gpus.items(): node_gpus[node_id] = sorted(gpu_ids) -- GitLab From e441bad674e6dca076a145ca63f72100318c51e5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 17 Jun 2024 15:08:05 -0700 Subject: [PATCH 072/376] [Optimization] use a pool to reuse LogicalTokenBlock.token_ids (#5584) --- vllm/block.py | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/vllm/block.py b/vllm/block.py index 2cc6b947f..e7fb29c8c 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -1,5 +1,7 @@ """Token blocks.""" -from typing import List +import weakref +from collections import defaultdict +from typing import Dict, List from vllm.utils import Device @@ -7,6 +9,35 @@ _BLANK_TOKEN_ID = -1 DEFAULT_LAST_ACCESSED_TIME = -1 +TokensBlock = List[int] + + +class BlockPool: + """A pool of physical blocks. + When requests come, we create a lot of logical blocks; + when requests are done, we destroy a lot of logical blocks. + It turns out that creating and destroying logical blocks can be expensive, + especially for the `token_ids` field, which is a list of integers. + To avoid this overhead, we use a pool to manage the logical blocks. + When an old request is done and a new request comes, we can reuse the + logical blocks from the old request to feed the new request. 
+ """ + + def __init__(self) -> None: + # block size to list of token blocks + self.pool: Dict[int, List[TokensBlock]] = defaultdict(list) + + def alloc_block(self, block_size: int) -> TokensBlock: + if block_size in self.pool and self.pool[block_size]: + return self.pool[block_size].pop() + return [_BLANK_TOKEN_ID] * block_size + + def del_block(self, block: TokensBlock) -> None: + self.pool[len(block)].append(block) + + +_BLOCK_POOL = BlockPool() + class LogicalTokenBlock: """A block that stores a contiguous chunk of tokens from left to right. @@ -23,7 +54,13 @@ class LogicalTokenBlock: self.block_number = block_number self.block_size = block_size - self.token_ids = [_BLANK_TOKEN_ID] * block_size + self.token_ids = _BLOCK_POOL.alloc_block(block_size) + # this finalizer is used to return the block to the pool when the object is deleted # noqa + # NOTE: don't use __del__ because it cannot guarantee the order of finalization, # noqa + # i.e. `self.token_ids` may be deleted before `self`, and we lose + # the opportunity to return the block to the pool + self._finalizer = weakref.finalize(self, _BLOCK_POOL.del_block, + self.token_ids) self.num_tokens = 0 def is_empty(self) -> bool: -- GitLab From a3e8a05d4c1b79dd44eb92bb6f57eb40c3fbdb21 Mon Sep 17 00:00:00 2001 From: Bruce Fontaine Date: Mon, 17 Jun 2024 15:26:41 -0700 Subject: [PATCH 073/376] [Bugfix] Fix KV head calculation for MPT models when using GQA (#5142) --- vllm/config.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index b1a3a82f5..d95faf52d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -302,7 +302,11 @@ class ModelConfig: return 1 # For DBRX and MPT - if self.hf_config.model_type in ["dbrx", "mpt"]: + if self.hf_config.model_type == "mpt": + if "kv_n_heads" in self.hf_config.attn_config: + return self.hf_config.attn_config["kv_n_heads"] + return self.hf_config.num_attention_heads + if self.hf_config.model_type == "dbrx": return 
getattr(self.hf_config.attn_config, "kv_n_heads", self.hf_config.num_attention_heads) -- GitLab From 26e1188e51aca3b76184671d804a8b17c294b610 Mon Sep 17 00:00:00 2001 From: zifeitong Date: Mon, 17 Jun 2024 16:16:10 -0700 Subject: [PATCH 074/376] [Fix] Use utf-8 encoding in entrypoints/openai/run_batch.py (#5606) --- vllm/entrypoints/openai/run_batch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index b0c0f4ad2..2f1870187 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -58,7 +58,7 @@ async def read_file(path_or_url: str) -> str: session.get(path_or_url) as resp: return await resp.text() else: - with open(path_or_url, "r") as f: + with open(path_or_url, "r", encoding="utf-8") as f: return f.read() @@ -71,7 +71,7 @@ async def write_file(path_or_url: str, data: str) -> None: # We should make this async, but as long as this is always run as a # standalone program, blocking the event loop won't effect performance # in this particular case. 
- with open(path_or_url, "w") as f: + with open(path_or_url, "w", encoding="utf-8") as f: f.write(data) -- GitLab From fa9e3852290ecb6eaae45befbd629bb060f57fb7 Mon Sep 17 00:00:00 2001 From: sroy745 <142070531+sroy745@users.noreply.github.com> Date: Mon, 17 Jun 2024 19:29:09 -0700 Subject: [PATCH 075/376] [Speculative Decoding 1/2 ] Add typical acceptance sampling as one of the sampling techniques in the verifier (#5131) --- .../test_typical_acceptance_sampler.py | 464 ++++++++++++++++++ .../layers/rejection_sampler.py | 174 +------ .../layers/spec_decode_base_sampler.py | 206 ++++++++ .../layers/typical_acceptance_sampler.py | 186 +++++++ 4 files changed, 866 insertions(+), 164 deletions(-) create mode 100644 tests/samplers/test_typical_acceptance_sampler.py create mode 100644 vllm/model_executor/layers/spec_decode_base_sampler.py create mode 100644 vllm/model_executor/layers/typical_acceptance_sampler.py diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py new file mode 100644 index 000000000..87cf37bc9 --- /dev/null +++ b/tests/samplers/test_typical_acceptance_sampler.py @@ -0,0 +1,464 @@ +"""Tests for typical acceptance sampling.""" + +import pytest +import torch + +from vllm.model_executor.layers.typical_acceptance_sampler import ( + TypicalAcceptanceSampler) +from vllm.model_executor.utils import set_random_seed + +CUDA_DEVICES = [f"cuda:{i}" for i in range(1)] + + +def get_zero_temperature_prob_dist(batch_size, k, vocab_size): + """ + Generates a fake temperature zero probability distribution. + Returns: + 1. A fake temperature zero probability distribution of shape + [batch_size, k, vocab_size] + 2. Tensor of shape [batch_size, k] containing the token ids + of the probability 1.0 tokens at each position. 
+ """ + # Simulate temperature 0 probability distribution for target probabilities + # and create target probabilities such that only 1 token id has + # probability 1.0 + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + probs = torch.rand(batch_size, k, vocab_size) + _, zero_temperature_token_ids = torch.max(probs, dim=-1) + # set the probability of the tokens with ids in zero_temperature_token_ids + # to 1 and the rest to 0. + target_probs = torch.zeros_like(probs).scatter_( + -1, zero_temperature_token_ids.unsqueeze(-1), 1.0) + return target_probs, zero_temperature_token_ids + + +def get_draft_token_ids(batch_size: int, k: int, vocab_size: int, + token_ids_to_exclude: torch.Tensor): + """ + Returns a tensor of shape [batch_size, k] of fake draft token ids + drawn randomly from a vocab of size vocab_size. We however ensure + that token_ids from token_ids_to_exclude are excluded at the + corresponding positions. + """ + draft_token_ids = torch.empty(batch_size, k, dtype=torch.long) + for i in range(batch_size): + for j in range(k): + # Generate a random token ID excluding token_ids_to_exclude[i, j] + while True: + token_id = torch.randint(0, vocab_size, (1, )).item() + if token_id != token_ids_to_exclude[i, j]: + draft_token_ids[i, j] = token_id + break + return draft_token_ids + + +@pytest.mark.parametrize("k", list(range(1, 6))) +@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) +@pytest.mark.parametrize("batch_size", list(range(1, 32))) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, + device: str): + """ + Tests that the TypicalAcceptanceSampler forward succeeds for + different combinations of k, vocab_size, batch_size and num devices. 
+ """ + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler() + typical_acceptance_sampler.init_gpu_tensors(rank=0) + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + draft_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64) + # Verify that sampling succeeds for all cases. + typical_acceptance_sampler(target_probs, bonus_token_ids, draft_token_ids) + + +@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"]) +@pytest.mark.parametrize("which_token_ids", + ["bonus_token_ids", "draft_token_ids"]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_raises_when_vocab_oob(above_or_below_vocab_range: str, + which_token_ids: str, device: str): + """ + Tests that we throw an exception if the token ids fall outside + the bounds of the provided vocabulary. + """ + k = 3 + batch_size = 5 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler(strict_mode=True) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + draft_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64) + # Verify that appropriate exceptions are thrown for out + # of bound vocabs. 
+ oob_token_ids = None + if which_token_ids == "bonus_token_ids": + oob_token_ids = bonus_token_ids + elif which_token_ids == "draft_token_ids": + oob_token_ids = draft_token_ids + else: + raise AssertionError() + + if above_or_below_vocab_range == "above": + rogue_token_id = vocab_size + 1 + elif above_or_below_vocab_range == "below": + rogue_token_id = -1 + else: + raise AssertionError() + + oob_token_ids[0][0] = rogue_token_id + + with pytest.raises(AssertionError): + typical_acceptance_sampler(target_probs, bonus_token_ids, + draft_token_ids) + + +@pytest.mark.parametrize("seed", list(range(10))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_uniform_target_distribution_accepts_all_tokens( + seed: int, disable_bonus_tokens: bool, device: str): + """ + Test the TypicalAcceptanceSampler with a uniform target probability + distribution. + + This test verifies that when provided with a uniform target probability + distribution, the TypicalAcceptanceSampler accepts all draft tokens. The + entropy of the uniform target distribution being high should lead to all + draft tokens being accepted. The test also ensures that the behavior + regarding bonus tokens is consistent with the `disable_bonus_tokens` + flag. 
+ """ + set_random_seed(seed) + k = 3 + batch_size = 5 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + draft_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + # We are using a uniform target probability distribution. + # For a uniform distribution the entropy is very high and it + # should lead to all draft tokens being accepted. Verify that. + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + if disable_bonus_tokens: + assert torch.all(output_token_ids[:, -1] == -1) + else: + assert torch.all(output_token_ids[:, -1] == bonus_token_ids.squeeze()) + + assert torch.all(output_token_ids[:, :k] == draft_token_ids) + + +@pytest.mark.parametrize("seed", list(range(10))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_temperature_zero_target_distribution(seed: int, + disable_bonus_tokens: bool, + device: str): + """ + Test the TypicalAcceptanceSampler with a zero-temperature target + probability distribution. + + This test verifies that when using a zero-temperature target probability + distribution, where only one token has a probability of 1.0, the + TypicalAcceptanceSampler correctly rejects all draft tokens that do not + match this probability. Additionally, it ensures that when all draft + tokens are rejected, the sampler falls back to greedy sampling to select a + single token from the target distribution. 
+ """ + set_random_seed(seed) + k = 3 + batch_size = 5 + vocab_size = 30_000 + torch.set_default_device(device) + + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + # Simulate temperature 0 probability distribution for target probabilities + # and create target probabilities such that only 1 token id has + # probability 1.0 + target_probs, zero_temperature_token_ids = get_zero_temperature_prob_dist( + batch_size, k, vocab_size) + # Populate draft_token_ids such that they exclude the token_ids + # with probability = 1.0 + draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, + zero_temperature_token_ids) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + # The target probability distribution is a temperature zero distribution + # with zero entropy. Since our draft token ids don't match the probability + # 1.0 tokens in the target distribution we will reject all of them and + # fallback to the greedy sampling for selecting 1 token for each sequence. + # Verify the same. + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + assert torch.all(output_token_ids[:, -1] == -1) + assert torch.all(output_token_ids[:, 0] == zero_temperature_token_ids[:, + 0]) + + +@pytest.mark.parametrize("seed", list(range(10))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_mixed_target_distribution(seed: int, disable_bonus_tokens: bool, + device: str): + """ + Test the TypicalAcceptanceSampler with a mixed target probability + distribution. + + This test ensures that the TypicalAcceptanceSampler handles a mixed + target probability distribution correctly. 
Specifically, it uses a + zero-temperature distribution for some sequences and a uniform + distribution for others. The test verifies that: + + - For sequences with a zero-temperature distribution, only the token + with a probability of 1.0 is accepted, and all other tokens are rejected. + - For sequences with a uniform distribution, all draft tokens are + accepted. + - When `disable_bonus_tokens` is False, the bonus tokens are also accepted + for sequences with a uniform distribution. + """ + set_random_seed(seed) + k = 3 + batch_size = 4 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + # For sequences 0 and 2 set the distribution to a temperature + # zero distribution. For sequences 1 and 3 set it to a uniform + # distribution. + target_probs, zero_temperature_token_ids = (get_zero_temperature_prob_dist( + batch_size, k, vocab_size)) + draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, + zero_temperature_token_ids) + uniform_probs = torch.rand(2, k, vocab_size, dtype=torch.float32) + target_probs[[1, 3]] = uniform_probs + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + # verify the shape of output_token_ids + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + # For sequences 0 and 2 verify that only 1 token is accepted + # which is the token with probability 1.0 in the target distribution + # at position 0. + assert torch.all(output_token_ids[[0, 2], 1:] == -1) + assert (torch.all(output_token_ids[[0, 2], + 0] == zero_temperature_token_ids[[0, 2], + 0])) + # For sequences 1 and 3 verify that all tokens are accepted since the + # target probability distribution is uniform. 
In addition verify that + # if disable_bonus_tokens is false then we also accept the bonus tokens. + assert torch.all( + output_token_ids[[1, 3], :-1] == draft_token_ids[[1, 3], :]) + if disable_bonus_tokens: + assert torch.all(output_token_ids[[1, 3], -1] == -1) + else: + assert torch.all(output_token_ids[[1, 3], -1] != -1) + + +@pytest.mark.parametrize("seed", list(range(10))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_accept_tokens_partially(seed: int, disable_bonus_tokens: bool, + device: str): + """ + Test the TypicalAcceptanceSampler's behavior when only a subset of draft + tokens should be accepted. + + This test verifies that the TypicalAcceptanceSampler correctly accepts or + rejects draft tokens based on a zero-temperature target probability + distribution. Specifically, it ensures that: + + - When all draft tokens match tokens with a probability of 1.0 in the + target distribution, all draft tokens are accepted. + - When only some draft tokens match tokens with a probability of 1.0 in + the target distribution, only those matching tokens are accepted, and the + rest are rejected. + """ + set_random_seed(seed) + k = 5 + batch_size = 1 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + # Create a temperature zero target probability distribution and ensure + # all draft token ids correspond to the tokens with 1.0 probability. + # Verify that all of them are accepted. 
+ target_probs, zero_temperature_token_ids = (get_zero_temperature_prob_dist( + batch_size, k, vocab_size)) + draft_token_ids = zero_temperature_token_ids + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids) + if disable_bonus_tokens: + assert torch.all(output_token_ids[:, -1] == -1) + else: + assert torch.all(output_token_ids[:, -1] == bonus_token_ids) + # Next only keep the first 2 draft tokens same as the zero temperature + # tokens. For the remaining 3 choose some other tokens. In the + # response we will expect the first 2 tokens to be the same as the + # draft tokens and the rest as -1 + draft_token_ids_to_replace = get_draft_token_ids( + batch_size, k, vocab_size, zero_temperature_token_ids) + draft_token_ids = torch.cat( + (draft_token_ids[:, :2], draft_token_ids_to_replace[:, -3:]), dim=1) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + assert torch.all(output_token_ids[:, :2] == draft_token_ids[:, :2]) + assert torch.all(output_token_ids[:, -3:] == -1) + + +@pytest.mark.parametrize("seed", list(range(1))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_accept_tokens_set_non_default_posteriors(seed: int, + disable_bonus_tokens: bool, + device: str): + """ + Test the TypicalAcceptanceSampler with custom posterior thresholds and + alpha values. This test verifies that by modifying the posterior + thresholds and alpha values we can change the acceptance behavior of the + sampler. 
+ """ + set_random_seed(seed) + k = 5 + batch_size = 1 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + # Simulate temperature 0 probability distribution for target + # probabilities and create target probabilities such that only 1 token + # id has probability 1.0 and others have a very low probability of + # 0.00001. Populate draft_token_ids such that they exclude the token_ids + # with probability = 1.0. Without any changes to the posterior thresholds + # none of the draft tokens are accepted. + target_probs, zero_temperature_token_ids = (get_zero_temperature_prob_dist( + batch_size, k, vocab_size)) + target_probs[target_probs == 0] = 0.00001 + draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, + zero_temperature_token_ids) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + assert torch.all(output_token_ids[:, 1:-1] == -1) + + # Change the posterior threshold values to 0.0 so that we will + # now accept even draft tokens with very low probability in the + # target distribution. Simulate and verify the same. 
+ typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, + disable_bonus_tokens=disable_bonus_tokens, + posterior_threshold=0.0, + posterior_alpha=0.0) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids) + if disable_bonus_tokens: + assert torch.all(output_token_ids[:, -1] == -1) + else: + assert torch.all(output_token_ids[:, -1] == bonus_token_ids) + + +@pytest.mark.parametrize("seed", list(range(10))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_replacement_token_ids(seed: int, disable_bonus_tokens: bool, + device: str): + """ + Test the TypicalAcceptanceSampler's method for generating + replacement token IDs. + + This test verifies that the `_replacement_token_ids` method of the + TypicalAcceptanceSampler correctly identifies the token IDs to be used + as replacements based on the target probability distribution. + Specifically, it ensures that the method correctly identifies the + tokens with the highest probability for each sequence in the batch. 
+ """ + set_random_seed(seed) + k = 10 + batch_size = 5 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + expected_replacement_tokens = -torch.ones( + (batch_size, k), dtype=torch.long) + expected_replacement_tokens[:, 0] = torch.argmax(target_probs[:, 0, :], + dim=1) + actual_replacement_tokens = ( + typical_acceptance_sampler._replacement_token_ids(target_probs)) + assert torch.all(expected_replacement_tokens == actual_replacement_tokens) diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index a80703155..fe9b2fac1 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -1,12 +1,15 @@ from functools import cached_property -from typing import Optional, Tuple +from typing import Tuple import torch import torch.jit import torch.nn as nn +from vllm.model_executor.layers.spec_decode_base_sampler import ( + SpecDecodeBaseSampler) -class RejectionSampler(nn.Module): + +class RejectionSampler(SpecDecodeBaseSampler, nn.Module): """Apply modified rejection sampling as described in "Accelerating Large Language Model Decoding with Speculative Sampling" https://arxiv.org/pdf/2302.01318.pdf. @@ -22,39 +25,11 @@ class RejectionSampler(nn.Module): Require when bonus tokens will cause corrupt KV cache for proposal methods that require KV cache. strict_mode: Whether or not to perform shape/device/dtype checks - during sampling. This catches correctness issues but adds - nontrivial latency. + during sampling. This catches correctness issues but adds + nontrivial latency. 
""" - super().__init__() - self._disable_bonus_tokens = disable_bonus_tokens - self._strict_mode = strict_mode - - # NOTE: A "bonus token" is accepted iff all proposal tokens are - # accepted. There is always only one possible bonus token. We store this - # value in a variable for readability. - self._num_bonus_tokens = 1 - - self.num_accepted_tokens: Optional[torch.Tensor] = None - self.num_emitted_tokens: Optional[torch.Tensor] = None - self.num_draft_tokens: int = 0 - - def init_gpu_tensors(self, rank: int) -> None: - assert self.num_accepted_tokens is None - device = f"cuda:{rank}" - self.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - self.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - - @property - def probs_dtype(self): - return torch.float32 - - @property - def token_id_dtype(self): - return torch.int64 + SpecDecodeBaseSampler.__init__(self, disable_bonus_tokens, strict_mode) + nn.Module.__init__(self) def forward( self, @@ -100,15 +75,8 @@ class RejectionSampler(nn.Module): # Only perform shape/dtype/device checking in strict mode, as it adds # overhead. 
if self._strict_mode: - self._raise_if_incorrect_shape(target_probs, bonus_token_ids, - draft_probs, draft_token_ids) - self._raise_if_incorrect_dtype(target_probs, bonus_token_ids, + self._raise_if_incorrect_input(target_probs, bonus_token_ids, draft_probs, draft_token_ids) - self._raise_if_inconsistent_device(target_probs, bonus_token_ids, - draft_probs, draft_token_ids) - self._raise_if_out_of_bounds_vocab(target_probs.shape[-1], - bonus_token_ids, - draft_token_ids) accepted, recovered_token_ids = self._batch_modified_rejection_sampling( target_probs, @@ -272,128 +240,6 @@ class RejectionSampler(nn.Module): """ return torch.finfo(self.probs_dtype).tiny - def _create_output( - self, - accepted: torch.Tensor, # [batch_size, k] - recovered_token_ids: torch.Tensor, # [batch_size, k] - draft_token_ids: torch.Tensor, # [batch_size, k] - bonus_token_ids: torch.Tensor, # [batch_size] - ) -> torch.Tensor: - """Format output. Returns a matrix of token ids. When - a token is rejected via rejection sampling, all subsequent - token ids are set to -1 for the sequence. - - shape = [batch_size, k + num_bonus_tokens] - """ - bonus_token_ids = bonus_token_ids.squeeze() - batch_size, k = recovered_token_ids.shape - - # Determine the index of the first False value for each row. - limits = (accepted == 0).max(1).indices - limits[~(accepted == 0).any(1)] = k - - # Create masks using the indices. - indices = torch.arange(k, device=accepted.device).unsqueeze(0) - accepted_mask = indices < limits.unsqueeze(1) - after_false_mask = indices == limits.unsqueeze(1) - - # Create an extended output tensor - output_with_bonus_tokens = -torch.ones( - (batch_size, k + self._num_bonus_tokens), - dtype=self.token_id_dtype, - device=accepted.device) - output = output_with_bonus_tokens[:, :k] - - # Fill in the first k columns of the output tensor using masks and data - # tensors. 
- torch.where(accepted_mask, - draft_token_ids, - -torch.ones_like(draft_token_ids), - out=output) - - # Fill the last column. - # We check output directly as accepted may have True values inconsistent - # with causal acceptance. - output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1, - bonus_token_ids, -1) - - # We disable bonus tokens because it causes corrupt KV cache for - # proposal methods that require KV cache. We can fix it by "prefilling" - # the bonus token in the proposer. The following issue tracks the fix. - # https://github.com/vllm-project/vllm/issues/4212 - if self._disable_bonus_tokens: - output_with_bonus_tokens[:, -1] = -1 - - # Fill the recovered token ids. - output.mul_(~after_false_mask).add_( - recovered_token_ids.mul(after_false_mask)) - - self.num_accepted_tokens += accepted.sum() - self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum() - self.num_draft_tokens += batch_size * k - - return output_with_bonus_tokens - - def _raise_if_incorrect_shape( - self, - target_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> None: - (target_batch_size, num_target_probs, - target_vocab_size) = target_probs.shape - bonus_batch_size, num_bonus_tokens = bonus_token_ids.shape - draft_batch_size, num_draft_probs, draft_vocab_size = draft_probs.shape - draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape - - assert draft_batch_size == target_batch_size - assert num_draft_probs == num_target_probs - assert (draft_vocab_size == target_vocab_size - ), f"{draft_vocab_size=} {target_vocab_size=}" - - assert draft_token_ids_batch_size == draft_batch_size - assert num_draft_token_ids == num_draft_probs - - assert bonus_batch_size == target_batch_size - assert num_bonus_tokens == self._num_bonus_tokens - - def _raise_if_incorrect_dtype( - self, - target_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: 
torch.Tensor, - ) -> None: - assert all(probs.dtype == self.probs_dtype - for probs in [target_probs, draft_probs]) - assert all(token_ids.dtype == self.token_id_dtype - for token_ids in [bonus_token_ids, draft_token_ids]) - - def _raise_if_inconsistent_device( - self, - target_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> None: - devices = [ - t.device for t in - [target_probs, bonus_token_ids, draft_probs, draft_token_ids] - ] - assert all([devices[0] == device for device in devices]) - - def _raise_if_out_of_bounds_vocab( - self, - vocab_size: int, - bonus_token_ids: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> None: - assert torch.all(bonus_token_ids < vocab_size) - assert torch.all(bonus_token_ids >= 0) - assert torch.all(draft_token_ids < vocab_size) - assert torch.all(draft_token_ids >= 0) - # torch.multinomial forces a GPU<->CPU sync. # Therefore, we use an optimized implementation instead that skips the sync. diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py new file mode 100644 index 000000000..9856a7e7d --- /dev/null +++ b/vllm/model_executor/layers/spec_decode_base_sampler.py @@ -0,0 +1,206 @@ +from typing import Optional + +import torch + + +class SpecDecodeBaseSampler(): + """Base class for samplers used for Speculative Decoding verification + step. + """ + + def __init__(self, + disable_bonus_tokens: bool = True, + strict_mode: bool = False): + """Base class constructor. + Args: + disable_bonus_tokens: Whether or not to disable the bonus token. + Require when bonus tokens will cause corrupt KV cache for + proposal methods that require KV cache. + strict_mode: Whether or not to perform shape/device/dtype checks + during sampling. This catches correctness issues but adds + nontrivial latency. 
+ """ + super().__init__() + self._disable_bonus_tokens = disable_bonus_tokens + self._strict_mode = strict_mode + + # NOTE: A "bonus token" is accepted iff all proposal tokens are + # accepted. There is always only one possible bonus token. We store this + # value in a variable for readability. + self._num_bonus_tokens = 1 + + self.num_accepted_tokens: Optional[torch.Tensor] = None + self.num_emitted_tokens: Optional[torch.Tensor] = None + self.num_draft_tokens: int = 0 + + def init_gpu_tensors(self, rank: int) -> None: + assert self.num_accepted_tokens is None + device = f"cuda:{rank}" + self.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device=device) + self.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device=device) + + @property + def probs_dtype(self): + return torch.float32 + + @property + def token_id_dtype(self): + return torch.int64 + + def _create_output( + self, + accepted: torch.Tensor, # [batch_size, k] + substitute_token_ids: torch.Tensor, # [batch_size, k] + draft_token_ids: torch.Tensor, # [batch_size, k] + bonus_token_ids: torch.Tensor, # [batch_size] + ) -> torch.Tensor: + """Format output. Returns a matrix of token ids. When + a token is rejected via sampling, all subsequent token ids are + set to -1 for the sequence. + + Args: + accepted: A boolean tensor indicating if the corresponding + draft token in draft_token_ids should be accepted or not. + substitute_token_ids: A tensor of token_ids that can be used + as substitutes for the draft token ids if the proposed token + is rejected. + draft_token_ids: A tensor of token ids speculated by the + draft model. + bonus_token_ids: Token ids to use as the bonus token if + all the draft tokens are accepted. + Returns: + A tensor containing the accepted token ids. 
The shape of the + tensor is [batch_size, k + num_bonus_tokens] + """ + batch_size, k = substitute_token_ids.shape + bonus_token_ids = bonus_token_ids.squeeze() + # Determine the index of the first False value for each row. + limits = (accepted == 0).max(1).indices + limits[~(accepted == 0).any(1)] = k + + # Create masks using the indices. + indices = torch.arange(k, device=accepted.device).unsqueeze(0) + accepted_mask = indices < limits.unsqueeze(1) + after_false_mask = indices == limits.unsqueeze(1) + + # Create an extended output tensor + output_with_bonus_tokens = -torch.ones( + (batch_size, k + self._num_bonus_tokens), + dtype=self.token_id_dtype, + device=accepted.device) + output = output_with_bonus_tokens[:, :k] + + # Fill in the first k columns of the output tensor using masks and data + # tensors. + output[:, :k] = torch.where(accepted_mask, draft_token_ids, + -torch.ones_like(draft_token_ids)) + + # Fill the last column. + # We check output directly as accepted may have True values inconsistent + # with causal acceptance. + output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1, + bonus_token_ids, -1) + + # We disable bonus tokens because it causes corrupt KV cache for + # proposal methods that require KV cache. We can fix it by "prefilling" + # the bonus token in the proposer. The following issue tracks the fix. + # https://github.com/vllm-project/vllm/issues/4212 + if self._disable_bonus_tokens: + output_with_bonus_tokens[:, -1] = -1 + + # Fill the recovered token ids. 
def _raise_if_incorrect_input(
    self,
    target_probs: torch.Tensor,
    draft_token_ids: torch.Tensor,
    bonus_token_ids: torch.Tensor,
    draft_probs: Optional[torch.Tensor] = None,
) -> None:
    """Run every strict-mode sanity check on the sampler inputs.

    The checks run in this order: shapes, dtypes, device placement and
    finally vocabulary bounds of the token ids.

    Args:
        target_probs: Target-model probabilities; unpacked as
            (batch_size, num_tokens, vocab_size).
        draft_token_ids: Proposed token ids; unpacked as
            (batch_size, num_tokens).
        bonus_token_ids: Bonus token ids; unpacked as
            (batch_size, num_bonus_tokens).
        draft_probs: Optional draft-model probabilities with the same
            layout as ``target_probs``.

    Raises:
        AssertionError: if any individual check fails.
    """
    self._raise_if_incorrect_shape(target_probs, draft_token_ids,
                                   bonus_token_ids, draft_probs)
    self._raise_if_incorrect_dtype(target_probs, draft_token_ids,
                                   bonus_token_ids, draft_probs)
    self._raise_if_inconsistent_device(target_probs, draft_token_ids,
                                       bonus_token_ids, draft_probs)
    self._raise_if_out_of_bounds_vocab(target_probs.shape[-1],
                                       draft_token_ids, bonus_token_ids)


def _raise_if_incorrect_shape(
    self,
    target_probs: torch.Tensor,
    draft_token_ids: torch.Tensor,
    bonus_token_ids: torch.Tensor,
    draft_probs: Optional[torch.Tensor] = None,
) -> None:
    """Assert that all inputs agree on batch size, length and vocab size.

    Raises:
        AssertionError: on any mismatch; the message names both values.
    """
    (target_batch_size, num_target_probs,
     target_vocab_size) = target_probs.shape

    # validate the shape of draft token ids.
    draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape
    assert draft_token_ids_batch_size == target_batch_size, (
        f"{draft_token_ids_batch_size=} {target_batch_size=}")
    assert num_draft_token_ids == num_target_probs, (
        f"{num_draft_token_ids=} {num_target_probs=}")

    # validate the shape of bonus token ids
    bonus_batch_size, num_bonus_tokens = bonus_token_ids.shape
    assert bonus_batch_size == target_batch_size, (
        f"{bonus_batch_size=} {target_batch_size=}")
    assert num_bonus_tokens == self._num_bonus_tokens, (
        f"{num_bonus_tokens=} {self._num_bonus_tokens=}")

    # validate the shape of draft probs if it is set
    if draft_probs is not None:
        (draft_batch_size, num_draft_probs,
         draft_vocab_size) = draft_probs.shape
        assert draft_batch_size == target_batch_size, (
            f"{draft_batch_size=} {target_batch_size=}")
        assert num_draft_probs == num_target_probs, (
            f"{num_draft_probs=} {num_target_probs=}")
        assert (draft_vocab_size == target_vocab_size
                ), f"{draft_vocab_size=} {target_vocab_size=}"


def _raise_if_incorrect_dtype(
    self,
    target_probs: torch.Tensor,
    draft_token_ids: torch.Tensor,
    bonus_token_ids: torch.Tensor,
    draft_probs: Optional[torch.Tensor] = None,
) -> None:
    """Assert probabilities use ``self.probs_dtype`` and ids use
    ``self.token_id_dtype``.

    Raises:
        AssertionError: if any tensor has an unexpected dtype.
    """
    assert target_probs.dtype == self.probs_dtype, (
        f"{target_probs.dtype=} {self.probs_dtype=}")
    assert draft_token_ids.dtype == self.token_id_dtype, (
        f"{draft_token_ids.dtype=} {self.token_id_dtype=}")
    assert bonus_token_ids.dtype == self.token_id_dtype, (
        f"{bonus_token_ids.dtype=} {self.token_id_dtype=}")
    if draft_probs is not None:
        assert draft_probs.dtype == self.probs_dtype, (
            f"{draft_probs.dtype=} {self.probs_dtype=}")


def _raise_if_inconsistent_device(
    self,
    target_probs: torch.Tensor,
    draft_token_ids: torch.Tensor,
    bonus_token_ids: torch.Tensor,
    draft_probs: Optional[torch.Tensor] = None,
) -> None:
    """Assert that every provided tensor lives on the same device.

    ``draft_probs`` is skipped when it is None.

    Raises:
        AssertionError: if at least two tensors are on different devices.
    """
    devices = [
        t.device for t in
        [target_probs, bonus_token_ids, draft_probs, draft_token_ids]
        if t is not None
    ]
    # A generator lets all() stop at the first mismatch without building
    # an intermediate list.
    assert all(devices[0] == device
               for device in devices), f"inconsistent devices: {devices}"


def _raise_if_out_of_bounds_vocab(
    self,
    vocab_size: int,
    draft_token_ids: torch.Tensor,
    bonus_token_ids: torch.Tensor,
) -> None:
    """Assert that every draft and bonus token id is in [0, vocab_size).

    Raises:
        AssertionError: if any id is negative or >= ``vocab_size``.
    """
    assert torch.all(bonus_token_ids < vocab_size), (
        f"bonus token id >= {vocab_size=}")
    assert torch.all(bonus_token_ids >= 0), "negative bonus token id"
    assert torch.all(draft_token_ids < vocab_size), (
        f"draft token id >= {vocab_size=}")
    assert torch.all(draft_token_ids >= 0), "negative draft token id"
import torch
import torch.jit
import torch.nn as nn

from vllm.model_executor.layers.spec_decode_base_sampler import (
    SpecDecodeBaseSampler)


class TypicalAcceptanceSampler(SpecDecodeBaseSampler, nn.Module):
    """Apply typical acceptance sampling as described in section 3.3.1 in
        "MEDUSA: Simple LLM Inference Acceleration Framework with
        Multiple Decoding Heads"
        https://arxiv.org/pdf/2401.10774
    """

    def __init__(
        self,
        disable_bonus_tokens: bool = False,
        strict_mode: bool = False,
        posterior_threshold: float = 0.09,
        posterior_alpha: float = 0.3,
    ):
        """Create a Typical Acceptance Sampler.

        Args:
            disable_bonus_tokens: Whether or not to disable the bonus token.
            Require when bonus tokens will cause corrupt KV cache for
            proposal methods that require KV cache.
            strict_mode: Whether or not to perform shape/device/dtype checks
                during sampling. This catches correctness issues but adds
                nontrivial latency.
            posterior_threshold : A threshold value that sets a lower bound
                on the posterior probability of a token in target model for it
                to be accepted. Default is 0.09
            posterior_alpha : A scaling factor for the entropy-based
                threshold in typical acceptance sampling. Typically defaults to
                sqrt of posterior_threshold and is set to 0.3.
        """
        SpecDecodeBaseSampler.__init__(
            self,
            disable_bonus_tokens=disable_bonus_tokens,
            strict_mode=strict_mode)
        nn.Module.__init__(self)
        self._posterior_threshold = posterior_threshold
        self._posterior_alpha = posterior_alpha

    def forward(
        self,
        target_probs: torch.Tensor,
        bonus_token_ids: torch.Tensor,
        draft_token_ids: torch.Tensor,
    ) -> torch.Tensor:
        """Sample token ids using typical acceptance sampling. This accepts
        or rejects the proposed draft tokens using only the probability the
        target model assigns to them; no draft probabilities are needed.

        In the worst case where all draft tokens are rejected, it is guaranteed
        one token will be emitted.

        In the case where all draft tokens are accepted, the bonus token will be
        accepted conditioned on self._disable_bonus_tokens being false.

        Args:
            target_probs: The probability distribution over token ids given
                context according to the target model.
            shape = [batch_size, num_speculative_tokens, vocab_size]

            bonus_token_ids: The "bonus" token ids that are accepted iff all
                speculative tokens in a sequence are accepted.
            shape = [batch_size, num_bonus_tokens]

            draft_token_ids: The token ids that were proposed by the draft
                model.
            shape = [batch_size, num_speculative_tokens]

        Returns:
            output_token_ids: The token ids selected via typical acceptance
                sampling, or -1 if unable to sample a token because the
                previous token was rejected.
            shape = [batch_size, num_speculative_tokens + num_bonus_tokens]
        """
        # Only perform shape/dtype/device checking in strict mode, as it adds
        # overhead.
        if self._strict_mode:
            self._raise_if_incorrect_input(target_probs, draft_token_ids,
                                           bonus_token_ids)
        accepted = self._evaluate_accepted_tokens(target_probs,
                                                  draft_token_ids)
        recovered_token_ids = self._replacement_token_ids(target_probs)
        output_token_ids = self._create_output(accepted, recovered_token_ids,
                                               draft_token_ids,
                                               bonus_token_ids)
        return output_token_ids

    def _evaluate_accepted_tokens(
            self, target_probs: torch.Tensor,
            draft_token_ids: torch.Tensor) -> torch.Tensor:
        r"""
        Evaluates and returns a mask of accepted tokens based on the
        posterior probabilities.

        Parameters:
        ----------
        target_probs : torch.Tensor
            A tensor of shape (batch_size, k, vocab_size) representing
            the probabilities of each token in the vocabulary for each
            position in the proposed sequence. This is the distribution
            generated by the target model.
        draft_token_ids : torch.Tensor
            A tensor of shape (batch_size, k) representing the proposed
            token ids.

        A draft token_id x_{n+k} is accepted if it satisfies the
        following condition

        .. math::
            p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) >
            \min \left( \epsilon, \delta * \exp \left(
                -H(p_{\text{original}}(
                    \cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right)

        where :math:`p_{\text{original}}` corresponds to target_probs
        and :math:`\epsilon` and :math:`\delta` correspond to hyperparameters
        specified using self._posterior_threshold and self._posterior_alpha

        This method computes the posterior probabilities for the given
        draft token ids based on the provided target probabilities. It
        calculates the entropy of the posterior distribution and determines
        a dynamic threshold for each token position using the provided
        posterior_threshold and posterior_alpha values. The method then
        returns a boolean mask indicating which tokens can be accepted.

        Returns:
        -------
        torch.Tensor
            A boolean tensor of shape (batch_size, k) where each element
            indicates whether the corresponding draft token has been accepted
            or rejected. True indicates acceptance and false indicates
            rejection.

        """
        # Probability the target model assigns to each proposed token.
        candidates_prob = torch.gather(
            target_probs, dim=-1,
            index=draft_token_ids.unsqueeze(-1)).squeeze(-1)
        # A small constant added to prevent computing the logarithm of zero,
        # which can lead to undefined values.
        epsilon = 1e-5
        posterior_entropy = -torch.sum(
            target_probs * torch.log(target_probs + epsilon), dim=-1)
        # min(posterior_threshold, alpha * exp(-H)): clamping from above by a
        # scalar gives the same values as an elementwise minimum against a
        # constant tensor, without allocating that tensor.
        threshold = torch.clamp(
            torch.exp(-posterior_entropy) * self._posterior_alpha,
            max=self._posterior_threshold)
        accepted_mask = candidates_prob > threshold
        return accepted_mask

    def _replacement_token_ids(self,
                               target_probs: torch.Tensor) -> torch.Tensor:
        """
        Generate one replacement token ID for each sequence based on target
        probabilities. The replacement token is used as the fallback option
        if typical acceptance sampling does not accept any draft tokens for
        that particular sequence.

        This method computes the token IDs to be replaced by selecting the
        token with the highest probability for each sequence in the first
        position. The rest of the output is filled with -1.

        Parameters
        ----------
        target_probs : torch.Tensor
            A tensor of shape (batch_size, k, vocab_size) containing
            the target probability distribution

        Returns
        -------
        torch.Tensor
            A tensor of shape (batch_size, k) with the replacement
            token IDs. Only the first column is set, and the rest of the
            columns are filled with -1.
        """
        max_indices = torch.argmax(target_probs[:, 0, :], dim=1)
        output = torch.full((target_probs.shape[0], target_probs.shape[1]),
                            -1,
                            dtype=self.token_id_dtype,
                            device=target_probs.device)
        output[:, 0] = max_indices
        return output
import os
import subprocess

from PIL import Image

from vllm import LLM, SamplingParams
from vllm.multimodal.image import ImagePixelData


def run_phi3v():
    """Run one single-image prompt through Phi-3-vision and print the reply.

    Expects ``images/cherry_blossom.jpg`` to exist relative to the current
    working directory (the ``__main__`` block below syncs that directory).
    """
    model_path = "microsoft/Phi-3-vision-128k-instruct"
    # Number of <|image|> placeholder tokens per image. The same value must
    # be given to the engine and used when expanding the prompt, so it is
    # defined once here.
    image_feature_size = 1921
    llm = LLM(
        model=model_path,
        trust_remote_code=True,
        max_model_len=4096,
        image_input_type="pixel_values",
        image_token_id=32044,
        image_input_shape="1,3,1008,1344",
        image_feature_size=image_feature_size,
        disable_image_processor=False,
    )

    image = Image.open("images/cherry_blossom.jpg")

    # single-image prompt
    prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n"  # noqa: E501
    # The placeholder run is terminated with "<s>"; a bare `+ ""` here would
    # be a no-op.
    prompt = prompt.replace("<|image_1|>",
                            "<|image|>" * image_feature_size + "<s>")

    sampling_params = SamplingParams(temperature=0, max_tokens=64)

    outputs = llm.generate({
        "prompt": prompt,
        "sampling_params": sampling_params,
        "multi_modal_data": ImagePixelData(image),
    })
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


if __name__ == "__main__":
    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
    local_directory = "images"

    # Make sure the local directory exists or create it
    os.makedirs(local_directory, exist_ok=True)

    # Use AWS CLI to sync the directory, assume anonymous access
    subprocess.check_call([
        "aws",
        "s3",
        "sync",
        s3_bucket_path,
        local_directory,
        "--no-sign-request",
    ])
    run_phi3v()
from typing import List, Tuple

import pytest
from transformers import AutoTokenizer

from vllm.config import VisionLanguageConfig
from vllm.utils import is_cpu

from ..conftest import IMAGE_FILES

pytestmark = pytest.mark.llava

# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS = [
    "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
    "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",
]

assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)


def iter_phi3v_configs(model_name: str):
    """Yield ``(model_name, VisionLanguageConfig)`` pairs to parametrize on.

    One config is produced per (image size, input type) combination listed
    below; currently that is a single pixel-values configuration.
    """
    image_hw_to_feature_size = {
        (1008, 1344): 1921,
    }

    for (h, w), f in image_hw_to_feature_size.items():
        for input_type, input_shape in [
            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
        ]:
            yield (model_name,
                   VisionLanguageConfig(image_input_type=input_type,
                                        image_feature_size=f,
                                        image_token_id=32044,
                                        image_input_shape=input_shape,
                                        image_processor=model_name,
                                        image_processor_revision=None))


model_and_vl_config = [
    *iter_phi3v_configs("microsoft/Phi-3-vision-128k-instruct"),
]


def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
                      vlm_config: VisionLanguageConfig, model_id: str):
    """Sanitize vllm output to be comparable with hf output.

    Every occurrence of the image token id in `input_ids` is replaced by 0.
    In `output_str`, the run of `image_feature_size` image-token strings is
    removed, "<s>" and the "<|end|>\\n<|assistant|>" separator each become a
    single space, and "<|user|>" is dropped.
    """
    input_ids, output_str = vllm_output
    image_token_id = vlm_config.image_token_id

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    image_token_str = tokenizer.decode(image_token_id)

    hf_input_ids = [
        input_id if input_id != image_token_id else 0
        for input_id in input_ids
    ]
    # NOTE: the second replace must target the literal "<s>" token. Replacing
    # the empty string would insert a space between every character.
    hf_output_str = output_str \
        .replace(image_token_str * vlm_config.image_feature_size, "") \
        .replace("<s>", " ").replace("<|user|>", "") \
        .replace("<|end|>\n<|assistant|>", " ")

    return hf_input_ids, hf_output_str


target_dtype = "half"
if is_cpu():
    target_dtype = "bfloat16"


# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
# Since we use _attn_implementation="eager" for hf_runner, here is
# numeric difference for longer context and test can't pass
@pytest.mark.parametrize("model_and_config", model_and_vl_config)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [8])
def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                model_and_config, dtype: str, max_tokens: int) -> None:
    """Inference result should be the same between hf and vllm.

    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalData objects and corresponding
    vision language config as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    model_id, vlm_config = model_and_config

    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
    hf_model_kwargs = {"_attn_implementation": "eager"}
    with hf_runner(model_id, dtype=dtype,
                   model_kwargs=hf_model_kwargs) as hf_model:
        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
                                              max_tokens,
                                              images=hf_images)

    # vLLM expects one <|image|> placeholder per image feature, followed by
    # the "<s>" token that vllm_to_hf_output() later strips again.
    vllm_image_prompts = [
        p.replace("<|image_1|>",
                  "<|image|>" * vlm_config.image_feature_size + "<s>")
        for p in HF_IMAGE_PROMPTS
    ]

    with vllm_runner(model_id,
                     max_model_len=2048,
                     dtype=dtype,
                     enforce_eager=True,
                     **vlm_config.as_cli_args_dict()) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                                  max_tokens,
                                                  images=vllm_images)

    for i in range(len(HF_IMAGE_PROMPTS)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_to_hf_output(
            vllm_outputs[i], vlm_config, model_id)
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
# coding=utf-8
# Copyright 2024 The vLLM team.
# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Iterable, List, Literal, Optional, Tuple, TypedDict

import torch
import torch.nn as nn
from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig
from transformers.utils import logging

from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, VisionLanguageConfig
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.llama import LlamaModel
from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import get_dummy_image_data
from vllm.sequence import SamplerOutput

logger = logging.get_logger(__name__)

# Checkpoint key prefixes that are renamed when weights are loaded.
_KEYS_TO_MODIFY_MAPPING = {
    "model.vision_embed_tokens": "vision_embed_tokens",
}

CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0,
                                                     hidden_act="quick_gelu",
                                                     hidden_size=1024,
                                                     image_size=336,
                                                     intermediate_size=4096,
                                                     num_attention_heads=16,
                                                     num_channels=3,
                                                     num_hidden_layers=24,
                                                     patch_size=14,
                                                     projection_dim=768)


class Phi3ImageEmbeddingBase(nn.Module):
    """Shared state and feature extraction for Phi-3 image embeddings.

    Subclasses are expected to set ``layer_idx``, ``type_feature`` and
    ``img_processor`` before ``get_img_features`` is called.
    """

    def __init__(self, wte=None) -> None:
        super().__init__()
        # Text token embedding module used to embed ``input_ids``.
        self.wte = wte
        self.layer_idx: int
        self.type_feature: str
        self.img_processor: CLIPVisionModel

    def set_img_features(self, img_features: torch.FloatTensor) -> None:
        """Store image features to be used by the next forward call."""
        self.img_features = img_features

    def set_img_sizes(self, img_sizes: torch.LongTensor) -> None:
        """Store image sizes that override the ones passed to forward."""
        self.img_sizes = img_sizes

    def get_img_features(self,
                         img_embeds: torch.FloatTensor) -> torch.FloatTensor:
        """Run the image processor and return one hidden-state layer.

        Returns the hidden states at ``self.layer_idx``; for the "patch"
        feature type the first position along dim 1 is dropped, for
        "cls_patch" everything is returned.

        Raises:
            NotImplementedError: for any other ``self.type_feature``.
        """
        LAYER_IDX = self.layer_idx
        TYPE_FEATURE = self.type_feature

        img_processor_output = self.img_processor(img_embeds,
                                                  output_hidden_states=True)
        img_feature = img_processor_output.hidden_states[LAYER_IDX]

        if TYPE_FEATURE == "patch":
            patch_feature = img_feature[:, 1:]
            return patch_feature

        if TYPE_FEATURE == "cls_patch":
            return img_feature

        raise NotImplementedError(
            f"Unsupported type_feature: {TYPE_FEATURE!r}")


# adapted from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_embedding_phi3_v.py
class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
    """Phi3 Image embedding with HD transform."""

    def __init__(self,
                 vision_language_config: VisionLanguageConfig,
                 config: PretrainedConfig,
                 wte=None) -> None:
        super().__init__(wte)

        self.image_token_id = vision_language_config.image_token_id
        # n_embed or hidden_size
        hidden_size = config.n_embd if hasattr(
            config, 'n_embd') else config.hidden_size

        clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG
        self.img_processor = CLIPVisionModel(clip_config)
        image_dim_out = config.img_processor['image_dim_out']
        self.num_img_tokens = config.img_processor['num_img_tokens']

        self.image_dim_out = image_dim_out
        self.img_sizes = None

        # global_gn and sub_gn for hd transform, serves as line separator
        self.use_hd_transform = config.embd_layer.get('use_hd_transform',
                                                      False)
        self.with_learnable_separator = config.embd_layer.get(
            'with_learnable_separator', False)
        self.hd_transform_order = config.embd_layer.get(
            'hd_transform_order', 'glb_sub')
        # with_hd_transform and with_learnable_separator should have same value
        assert self.use_hd_transform and self.with_learnable_separator

        # 1024 * 4, merge spatial to channel dimension
        self.glb_GN = nn.Parameter(torch.empty([1, 1, self.image_dim_out * 4]))
        self.sub_GN = nn.Parameter(
            torch.empty([1, 1, 1, self.image_dim_out * 4]))

        dim_projection = hidden_size
        depth = 2
        layers = [nn.Linear(image_dim_out * 4, dim_projection)]
        for _ in range(1, depth):
            layers.extend(
                [nn.GELU(),
                 nn.Linear(dim_projection, dim_projection)])
        self.img_projection = nn.Sequential(*layers)

        self.vocab_size = config.vocab_size
        self.img_features = None

        self.layer_idx = config.img_processor.get('layer_idx', -2)
        self.type_feature = config.img_processor.get('type_feature', 'patch')

    def forward(self,
                input_ids: torch.LongTensor,
                pixel_values: torch.FloatTensor,
                image_sizes=None) -> torch.FloatTensor:
        """process and merge text embeddings with image embeddings."""

        img_embeds = pixel_values
        img_sizes = image_sizes

        # Features/sizes stored through the setters take precedence over the
        # call arguments; stored features are consumed once.
        if self.img_features is not None:
            img_embeds = self.img_features.clone()
            self.img_features = None

        if self.img_sizes is not None:
            img_sizes = self.img_sizes

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        positions = torch.nonzero(input_ids == self.image_token_id)

        select = False

        target_device = self.img_projection[0].bias.device
        target_dtype = self.img_projection[0].bias.dtype

        # Only the row count is needed here; there is no reason to copy the
        # whole index tensor into a Python list.
        if positions.shape[0] > 0:
            # if self.use_hd_transform and img_sizes:
            # img_embeds: (num_images, max_num_crops, 3, H, W)
            # img_sizes: (num_images, 2).view(1, -1)

            bs = img_embeds.shape[0]
            # Nx(HW)xC
            img_features = self.get_img_features(img_embeds.flatten(0, 1))
            base_feat_height = base_feat_width = int(
                img_features.shape[1]**0.5)

            # bs x max_num_crops x (24x24) x C
            img_features = img_features.view(
                bs, -1, base_feat_height * base_feat_width, self.image_dim_out)
            C = self.image_dim_out
            H = base_feat_height

            output_imgs = []
            output_len = []

            if isinstance(img_sizes, torch.Tensor):
                img_sizes.squeeze_(0)

            for _bs in range(bs):
                # NOTE(review): the same (h, w) is unpacked for every batch
                # element, which looks valid only for a single image per
                # call -- confirm against the callers before batching.
                h, w = img_sizes
                h = h // 336
                w = w // 336
                B_ = h * w

                # 1 x (24x24) x 1024
                global_img_feature = img_features[_bs, :1]

                # 1 x 12 x 12 x 4096
                glb_img = global_img_feature \
                    .reshape(1, H // 2, 2, H // 2, 2, C) \
                    .permute(0, 1, 3, 2, 4, 5) \
                    .reshape(1, H // 2, H // 2, 4 * C)
                temp_glb_GN = self.sub_GN.repeat(1, H // 2, 1, 1)

                # 1 x 156 x 4096
                glb_img = torch.cat([glb_img, temp_glb_GN],
                                    dim=2).reshape(1, -1, 4 * C)

                # (max_num_crops-1) x (12x12) x C
                sub_img = img_features[_bs, 1:]
                # 16x574x1024
                # get rid of padding sub_img
                sub_img = sub_img[:B_]

                sub_img = sub_img.reshape(B_, H // 2, 2, H // 2, 2, C) \
                    .permute(0, 1, 3, 2, 4, 5).reshape(B_, -1, 4 * C)
                sub_img = sub_img.reshape(1, h, w, 12, 12, -1) \
                    .permute(0, 1, 3, 2, 4, 5) \
                    .reshape(1, h * 12, w * 12, 4 * C)
                temp_sub_GN = self.sub_GN.repeat(1, h * 12, 1, 1)
                sub_img = torch.cat([sub_img, temp_sub_GN],
                                    dim=2).reshape(1, -1, 4 * C)
                # (1, num_img_tokens, 1024*4)

                # glb + sub
                if self.hd_transform_order == 'glb_sub':
                    output_imgs.append(
                        torch.cat([glb_img, self.glb_GN, sub_img], dim=1))
                elif self.hd_transform_order == 'sub_glb':
                    output_imgs.append(
                        torch.cat([sub_img, self.glb_GN, glb_img], dim=1))

                temp_len = int((h * w + 1) * 144 + 1 + (h + 1) * 12)
                output_len.append(temp_len)

            num_img_tokens = output_len
            img_set_tensor = []
            for _output_img in output_imgs:
                img_feature_proj = self.img_projection(
                    _output_img.to(target_device, target_dtype))
                img_set_tensor.append(img_feature_proj)
            select = True

        # NOTE(review): the upper clamp bound is vocab_size itself, not
        # vocab_size - 1 -- confirm this matches the embedding table size.
        input_ids.clamp_min_(0).clamp_max_(self.vocab_size)

        hidden_states = self.wte(input_ids)

        if select:
            # Overwrite the embeddings at the image-token positions with the
            # projected image features, one contiguous span per image.
            idx = 0
            for i, cnt in enumerate(num_img_tokens):
                hidden_states[positions[idx, 0],
                              positions[idx, 1]:positions[idx, 1] +
                              cnt] = (img_set_tensor[i].to(
                                  hidden_states.device, hidden_states.dtype))
                idx += cnt

        return hidden_states.squeeze(0)


class Phi3VImagePixelInputs(TypedDict):
    type: Literal["pixel_values"]
    data: torch.Tensor
    """Shape: (batch_size, 1 + num_patches, num_channels, height, width)"""

    image_sizes: torch.Tensor
    """Shape: (batch_size, 2)"""


@MULTIMODAL_REGISTRY.register_image_pixel_input()
@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
class Phi3VForCausalLM(VisionLanguageModelBase):
    """Phi-3-vision: a Llama-style language model with HD image embeddings."""

    def __init__(self,
                 config: PretrainedConfig,
                 vision_language_config: VisionLanguageConfig,
                 cache_config: Optional[CacheConfig] = None,
                 quant_config: Optional[QuantizationConfig] = None) -> None:
        super().__init__(vision_language_config)
        self.config = config
        self.model = LlamaModel(config, cache_config, quant_config)
        # The image embedding reuses the language model's token embedding
        # for the text part of the prompt.
        self.vision_embed_tokens = Phi3HDImageEmbedding(
            vision_language_config, config, self.model.embed_tokens)
        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.sampler = Sampler()

    def _parse_and_validate_image_input(
            self, **kwargs: object) -> Optional[Phi3VImagePixelInputs]:
        """Extract ``pixel_values`` and ``image_sizes`` from the kwargs.

        Returns:
            The packed image inputs, or None when either value is missing.

        Raises:
            ValueError: if the configured image input type is not
                pixel values.
        """
        pixel_values = kwargs.pop("pixel_values", None)
        image_sizes = kwargs.pop("image_sizes", None)

        expected_input_type = self.vision_language_config.image_input_type
        ImageInputType = VisionLanguageConfig.ImageInputType

        if expected_input_type != ImageInputType.PIXEL_VALUES:
            raise ValueError(
                f"Unexpected image input type: {expected_input_type}. "
                "Phi3v only support pixel_values input currently.")

        if pixel_values is not None and image_sizes is not None:
            return Phi3VImagePixelInputs(type="pixel_values",
                                         data=pixel_values,
                                         image_sizes=image_sizes)

        return None

    def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
                kv_caches: List[torch.Tensor],
                attn_metadata: AttentionMetadata, **kwargs: object):
        """Run the language model, merging image embeddings when present."""
        image_input = self._parse_and_validate_image_input(**kwargs)

        if image_input is not None:
            inputs_embeds = self.vision_embed_tokens(
                input_ids, image_input["data"], image_input["image_sizes"])

            # The merged embeddings replace the token ids as model input.
            input_ids = None
        else:
            inputs_embeds = None

        hidden_states = self.model(input_ids,
                                   positions,
                                   kv_caches,
                                   attn_metadata,
                                   inputs_embeds=inputs_embeds)

        return hidden_states

    def compute_logits(self, hidden_states: torch.Tensor,
                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
        """Project hidden states to vocabulary logits via the LM head."""
        logits = self.logits_processor(self.lm_head.weight, hidden_states,
                                       sampling_metadata)
        return logits

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        """Sample next tokens from the logits."""
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Load checkpoint weights, fusing q/k/v and gate/up projections.

        Rotary ``inv_freq`` buffers are skipped, keys listed in
        ``_KEYS_TO_MODIFY_MAPPING`` are renamed, and ``.bias`` entries that
        have no matching parameter are ignored.
        """
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            (".qkv_proj", ".q_proj", "q"),
            (".qkv_proj", ".k_proj", "k"),
            (".qkv_proj", ".v_proj", "v"),
            (".gate_up_proj", ".gate_proj", 0),
            (".gate_up_proj", ".up_proj", 1),
        ]
        params_dict = dict(self.named_parameters())
        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue
            for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
                if key_to_modify in name:
                    name = name.replace(key_to_modify, new_key)
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                # We only do sharding for language model
                # and not vision model for now.
                if "vision_embed_tokens" in name and self.vision_embed_tokens:
                    continue
                if weight_name not in name:
                    continue
                param = params_dict[name.replace(weight_name, param_name)]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # No stacked mapping matched: load the tensor as-is.
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
*__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 5120) \ f(in_T, out_T, W_T, narrow, 5504) \ f(in_T, out_T, W_T, narrow, 5632) \ + f(in_T, out_T, W_T, narrow, 5888) \ f(in_T, out_T, W_T, narrow, 6144) \ f(in_T, out_T, W_T, narrow, 6400) \ f(in_T, out_T, W_T, narrow, 6848) \ @@ -45,6 +47,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 9216) \ f(in_T, out_T, W_T, narrow, 10240) \ f(in_T, out_T, W_T, narrow, 11008) \ + f(in_T, out_T, W_T, narrow, 11264) \ f(in_T, out_T, W_T, narrow, 12288) \ f(in_T, out_T, W_T, narrow, 13696) \ f(in_T, out_T, W_T, narrow, 13824) \ @@ -53,6 +56,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 16384) \ f(in_T, out_T, W_T, narrow, 20480) \ f(in_T, out_T, W_T, narrow, 22016) \ + f(in_T, out_T, W_T, narrow, 22528) \ f(in_T, out_T, W_T, narrow, 24576) \ f(in_T, out_T, W_T, narrow, 27392) \ f(in_T, out_T, W_T, narrow, 27648) \ @@ -91,6 +95,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 1152, narrow) \ f(in_T, out_T, W_T, 1280, narrow) \ f(in_T, out_T, W_T, 1536, narrow) \ + f(in_T, out_T, W_T, 1664, narrow) \ f(in_T, out_T, W_T, 1728, narrow) \ f(in_T, out_T, W_T, 1792, narrow) \ f(in_T, out_T, W_T, 2048, narrow) \ @@ -107,6 +112,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 5120, narrow) \ f(in_T, out_T, W_T, 5504, narrow) \ f(in_T, out_T, W_T, 5632, narrow) \ + f(in_T, out_T, W_T, 5888, narrow) \ f(in_T, out_T, W_T, 6144, narrow) \ f(in_T, out_T, W_T, 6400, narrow) \ f(in_T, out_T, W_T, 6848, narrow) \ @@ -116,6 +122,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 9216, narrow) \ f(in_T, out_T, W_T, 10240, narrow) \ f(in_T, out_T, W_T, 11008, narrow) \ + f(in_T, out_T, W_T, 11264, narrow) \ f(in_T, out_T, W_T, 12288, narrow) \ f(in_T, out_T, W_T, 13696, narrow) \ f(in_T, out_T, 
W_T, 13824, narrow) \ @@ -124,6 +131,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 16384, narrow) \ f(in_T, out_T, W_T, 20480, narrow) \ f(in_T, out_T, W_T, 22016, narrow) \ + f(in_T, out_T, W_T, 22528, narrow) \ f(in_T, out_T, W_T, 24576, narrow) \ f(in_T, out_T, W_T, 27392, narrow) \ f(in_T, out_T, W_T, 27648, narrow) \ diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index f021c003b..d87658e5d 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -53,6 +53,7 @@ H1 = H2 = [ 1152, 1280, 1536, + 1664, 2048, 2304, 2560, @@ -66,6 +67,7 @@ H1 = H2 = [ 5120, 5504, 5632, + 5888, 6144, 6400, 6848, @@ -75,10 +77,12 @@ H1 = H2 = [ 9216, 10240, 11008, + 11264, 13824, 14336, 15360, 22016, + 22528, 24576, 27392, 27648, -- GitLab From 8eadcf0b90f126cf9b23f9583a53b19b6b58fd87 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 17 Jun 2024 20:54:57 -0700 Subject: [PATCH 078/376] [misc][typo] fix typo (#5620) --- vllm/block.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/block.py b/vllm/block.py index e7fb29c8c..bd00c07ad 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -13,7 +13,7 @@ TokensBlock = List[int] class BlockPool: - """A pool of physical blocks. + """A pool of logical blocks. When requests come, we create a lot of logical blocks; when requests are done, we destroy a lot of logical blocks. 
It turns out that creating and destroying logical blocks can be expensive, -- GitLab From 32c86e494a49dff8d1d4b10c5922a36daa6e8faf Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 18 Jun 2024 11:58:30 +0800 Subject: [PATCH 079/376] [Misc] Fix typo (#5618) --- .../scripts/convert-results-json-to-markdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 9aa8162d1..e1002213f 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -21,7 +21,7 @@ latency_column_mapping = { "P99": "P99", } -# thoughput tests and the keys that will be printed into markdown +# throughput tests and the keys that will be printed into markdown throughput_results = [] throughput_results_column_mapping = { "test_name": "Test name", -- GitLab From 114d7270ffc2e5a66e0974b0d6d913c7f990afa7 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 17 Jun 2024 21:37:18 -0700 Subject: [PATCH 080/376] [CI] Avoid naming different metrics with the same name in performance benchmark (#5615) --- .../convert-results-json-to-markdown.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index e1002213f..534ecf179 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -15,10 +15,10 @@ latency_column_mapping = { "avg_latency": "Mean latency (ms)", # "P10": "P10 (s)", # "P25": "P25 (s)", - "P50": "Median", + "P50": "Median latency (ms)", # "P75": "P75 (s)", # "P90": "P90 (s)", - "P99": "P99", + "P99": "P99 latency (ms)", } # 
throughput tests and the keys that will be printed into markdown @@ -43,15 +43,14 @@ serving_column_mapping = { # "input_throughput": "Input Tput (tok/s)", # "output_throughput": "Output Tput (tok/s)", "mean_ttft_ms": "Mean TTFT (ms)", - # do not say TTFT again to avoid the table getting too wide - "median_ttft_ms": "Median", - "p99_ttft_ms": "P99", + "median_ttft_ms": "Median TTFT (ms)", + "p99_ttft_ms": "P99 TTFT (ms)", # "mean_tpot_ms": "Mean TPOT (ms)", # "median_tpot_ms": "Median", # "p99_tpot_ms": "P99", "mean_itl_ms": "Mean ITL (ms)", - "median_itl_ms": "Median", - "p99_itl_ms": "P99", + "median_itl_ms": "Median ITL (ms)", + "p99_itl_ms": "P99 ITL (ms)", } @@ -183,3 +182,11 @@ if __name__ == "__main__": serving_tests_markdown_table=serving_md_table, benchmarking_results_in_json_string=processed_results_json) f.write(results) + + # document benchmarking results in json + with open(results_folder / "benchmark_results.json", "w") as f: + + results = latency_results.to_dict( + orient='records') + throughput_results.to_dict( + orient='records') + serving_results.to_dict(orient='records') + f.write(json.dumps(results)) -- GitLab From db5ec52ad7dc69dbe8dd9ba25fe8f2c6ce35a4cf Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 18 Jun 2024 00:21:05 -0700 Subject: [PATCH 081/376] [bugfix][distributed] improve p2p capability test (#5612) [bugfix][distributed] do not error if two processes do not agree on p2p capability (#5612) --- .../device_communicators/custom_all_reduce_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 75b7c374c..e0641a54c 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -71,6 +71,7 @@ def consumer(batch_tgt: Sequence[int], if open_success: # modify the memory 
lib.cudaMemset(pointer, 2, 1024) + lib.cudaDeviceSynchronize() # use two queues to simulate barrier producer_queue.get() consumer_queue.put(0) @@ -142,8 +143,13 @@ def can_actually_p2p( for src, tgt in zip(batch_src, batch_tgt): a = result_queue.get() b = result_queue.get() - assert a == b - result.append(a) + if a != b: + logger.warning( + "Two processes do not agree on the P2P access" + " status on %d -> %d, treat as disabled.", src, tgt) + result.append(False) + else: + result.append(a) return result -- GitLab From f0cc0e68e3ceef6fe43f78bf36df88e6cad28766 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Tue, 18 Jun 2024 05:12:19 -0700 Subject: [PATCH 082/376] [Misc] Remove import from transformers logging (#5625) --- vllm/model_executor/models/phi3v.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index e8f190d3f..35f3b894f 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -18,7 +18,6 @@ from typing import Iterable, List, Literal, Optional, Tuple, TypedDict import torch import torch.nn as nn from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig -from transformers.utils import logging from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VisionLanguageConfig @@ -35,8 +34,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import get_dummy_image_data from vllm.sequence import SamplerOutput -logger = logging.get_logger(__name__) - _KEYS_TO_MODIFY_MAPPING = { "model.vision_embed_tokens": "vision_embed_tokens", } -- GitLab From 4ad7b53e59b6600d050581329dfaba0222b13ae5 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Tue, 18 Jun 2024 06:10:04 -0700 Subject: [PATCH 083/376] [CI/Build][Misc] Update Pytest Marker for VLMs (#5623) --- .buildkite/run-cpu-test.sh | 2 +- .buildkite/test-pipeline.yaml | 6 +++--- pyproject.toml | 2 +- 
tests/models/test_llava.py | 2 +- tests/models/test_llava_next.py | 2 +- tests/models/test_phi3v.py | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 532d6ad88..f4fa24be1 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,4 +23,4 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" docker exec cpu-test bash -c "cd tests; pip install pytest Pillow protobuf cd ../ - pytest -v -s tests/models -m \"not llava\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6439a315e..c1e433ec4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -100,13 +100,13 @@ steps: - label: Models Test #mirror_hardwares: [amd] commands: - - pytest -v -s models -m \"not llava\" + - pytest -v -s models -m \"not vlm\" -- label: Llava Test +- label: Vision Language Models Test mirror_hardwares: [amd] commands: - bash ../.buildkite/download-images.sh - - pytest -v -s models -m llava + - pytest -v -s models -m vlm - label: Prefix Caching Test mirror_hardwares: [amd] diff --git a/pyproject.toml b/pyproject.toml index eb691c297..4958aae02 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,5 +71,5 @@ markers = [ "skip_global_cleanup", "llm: run tests for vLLM API only", "openai: run tests for OpenAI API only", - "llava: run tests for LLaVA models only", + "vlm: run tests for vision language models only", ] diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index a1f0cff1c..b41c69f72 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -7,7 +7,7 @@ from vllm.config import VisionLanguageConfig from ..conftest import IMAGE_FILES -pytestmark = pytest.mark.llava 
+pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass HF_IMAGE_PROMPTS = [ diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py index aa6ee268a..0eca5cb53 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/test_llava_next.py @@ -7,7 +7,7 @@ from vllm.config import VisionLanguageConfig from ..conftest import IMAGE_FILES -pytestmark = pytest.mark.llava +pytestmark = pytest.mark.vlm _PREFACE = ( "A chat between a curious human and an artificial intelligence assistant. " diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 607ad95e8..1732e8f08 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -8,7 +8,7 @@ from vllm.utils import is_cpu from ..conftest import IMAGE_FILES -pytestmark = pytest.mark.llava +pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass HF_IMAGE_PROMPTS = [ -- GitLab From 13db4369d9ab3158a01192d60c744c6523961824 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 18 Jun 2024 07:26:20 -0700 Subject: [PATCH 084/376] [ci] Deprecate original CI template (#5624) Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 2 +- .buildkite/test-template.j2 | 101 ---------------------------------- 2 files changed, 1 insertion(+), 102 deletions(-) delete mode 100644 .buildkite/test-template.j2 diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c1e433ec4..a81885b8a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1,6 +1,6 @@ # In this file, you can add more tests to run either by adding a new step or # adding a new command to an existing step. See different options here for examples. -# This script will be feed into Jinja template in `test-template.j2` to generate +# This script will be feed into Jinja template in `test-template-aws.j2` to generate # the final pipeline yaml file. 
steps: diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 deleted file mode 100644 index 3bd1e90c2..000000000 --- a/.buildkite/test-template.j2 +++ /dev/null @@ -1,101 +0,0 @@ -{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %} -{% set default_num_gpu = 1 %} -{% set default_working_dir = "/vllm-workspace/tests" %} - -steps: - - label: ":docker: build image" - commands: - - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." - - "docker push {{ docker_image }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - - wait - - - group: "AMD Tests" - depends_on: ~ - steps: - {% for step in steps %} - {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} - - label: "AMD: {{ step.label }}" - agents: - queue: amd - command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" - env: - DOCKER_BUILDKIT: "1" - soft_fail: true - {% endif %} - {% endfor %} - - - label: "Neuron Test" - depends_on: ~ - agents: - queue: neuron - command: bash .buildkite/run-neuron-test.sh - soft_fail: false - - - label: "Intel Test" - depends_on: ~ - agents: - queue: intel - command: bash .buildkite/run-cpu-test.sh - - - label: "XPU Test" - agents: - queue: intel - command: bash .buildkite/run-xpu-test.sh - - {% for step in steps %} - - label: "{{ step.label }}" - agents: - queue: kubernetes - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - kubernetes: - podSpec: - {% if step.num_gpus %} - priorityClassName: gpu-priority-cls-{{ step.num_gpus }} - {% endif %} - 
volumes: - - name: dshm - emptyDir: - medium: Memory - containers: - - image: "{{ docker_image }}" - command: ["bash"] - args: - - '-c' - - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" - {% if not step.no_gpu %} - resources: - requests: - nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" - limits: - nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" - {% endif %} - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - volumeMounts: - - mountPath: /dev/shm - name: dshm - {% endfor %} -- GitLab From 7879f24dcce75665d83865ee8281f2ef1bbb7e74 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Tue, 18 Jun 2024 19:17:03 +0300 Subject: [PATCH 085/376] [Misc] Add OpenTelemetry support (#4687) This PR adds basic support for OpenTelemetry distributed tracing. It includes changes to enable tracing functionality and improve monitoring capabilities. I've also added a Markdown guide with screenshots that shows users how to use this feature. 
You can find it at examples/production_monitoring/Otel.md --- .buildkite/test-pipeline.yaml | 9 ++ benchmarks/benchmark_latency.py | 48 +++++--- examples/production_monitoring/Otel.md | 82 +++++++++++++ .../production_monitoring/dummy_client.py | 35 ++++++ tests/tracing/__init__.py | 0 tests/tracing/test_tracing.py | 116 ++++++++++++++++++ vllm/config.py | 13 ++ vllm/engine/arg_utils.py | 40 ++++-- vllm/engine/async_llm_engine.py | 22 ++++ vllm/engine/llm_engine.py | 102 +++++++++++++-- vllm/entrypoints/openai/serving_chat.py | 11 ++ vllm/entrypoints/openai/serving_completion.py | 11 ++ vllm/sequence.py | 3 + vllm/tracing.py | 104 ++++++++++++++++ vllm/utils.py | 12 ++ 15 files changed, 567 insertions(+), 41 deletions(-) create mode 100644 examples/production_monitoring/Otel.md create mode 100644 examples/production_monitoring/dummy_client.py create mode 100644 tests/tracing/__init__.py create mode 100644 tests/tracing/test_tracing.py create mode 100644 vllm/tracing.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a81885b8a..5afe37302 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -159,6 +159,15 @@ steps: #mirror_hardwares: [amd] command: pytest -v -s quantization +- label: Tracing Test + commands: + - "pip install \ + opentelemetry-sdk \ + opentelemetry-api \ + opentelemetry-exporter-otlp \ + opentelemetry-semantic-conventions-ai" + - pytest -v -s tracing + - label: Benchmarks working_dir: "/vllm-workspace/.buildkite" mirror_hardwares: [amd] diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 767afd21a..98e0be277 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -20,26 +20,29 @@ def main(args: argparse.Namespace): # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. 
- llm = LLM(model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - use_v2_block_manager=args.use_v2_block_manager, - enable_chunked_prefill=args.enable_chunked_prefill, - download_dir=args.download_dir, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - load_format=args.load_format, - distributed_executor_backend=args.distributed_executor_backend) + llm = LLM( + model=args.model, + speculative_model=args.speculative_model, + num_speculative_tokens=args.num_speculative_tokens, + tokenizer=args.tokenizer, + quantization=args.quantization, + tensor_parallel_size=args.tensor_parallel_size, + trust_remote_code=args.trust_remote_code, + dtype=args.dtype, + enforce_eager=args.enforce_eager, + kv_cache_dtype=args.kv_cache_dtype, + quantization_param_path=args.quantization_param_path, + device=args.device, + ray_workers_use_nsight=args.ray_workers_use_nsight, + use_v2_block_manager=args.use_v2_block_manager, + enable_chunked_prefill=args.enable_chunked_prefill, + download_dir=args.download_dir, + block_size=args.block_size, + gpu_memory_utilization=args.gpu_memory_utilization, + load_format=args.load_format, + distributed_executor_backend=args.distributed_executor_backend, + otlp_traces_endpoint=args.otlp_traces_endpoint, + ) sampling_params = SamplingParams( n=args.n, @@ -254,5 +257,10 @@ if __name__ == '__main__': help='Backend to use for distributed serving. 
When more than 1 GPU ' 'is used, will be automatically set to "ray" if installed ' 'or "mp" (multiprocessing) otherwise.') + parser.add_argument( + '--otlp-traces-endpoint', + type=str, + default=None, + help='Target URL to which OpenTelemetry traces will be sent.') args = parser.parse_args() main(args) diff --git a/examples/production_monitoring/Otel.md b/examples/production_monitoring/Otel.md new file mode 100644 index 000000000..144944227 --- /dev/null +++ b/examples/production_monitoring/Otel.md @@ -0,0 +1,82 @@ +# Setup OpenTelemetry POC + +1. Install OpenTelemetry packages: + ``` + pip install \ + opentelemetry-sdk \ + opentelemetry-api \ + opentelemetry-exporter-otlp \ + opentelemetry-semantic-conventions-ai + ``` + +1. Start Jaeger in a docker container: + ``` + # From: https://www.jaegertracing.io/docs/1.57/getting-started/ + docker run --rm --name jaeger \ + -e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \ + -p 6831:6831/udp \ + -p 6832:6832/udp \ + -p 5778:5778 \ + -p 16686:16686 \ + -p 4317:4317 \ + -p 4318:4318 \ + -p 14250:14250 \ + -p 14268:14268 \ + -p 14269:14269 \ + -p 9411:9411 \ + jaegertracing/all-in-one:1.57 + ``` + +1. In a new shell, export Jaeger IP: + ``` + export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger) + export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317 + ``` + Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM: + ``` + export OTEL_SERVICE_NAME="vllm-server" + export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true + python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" + ``` + +1. 
In a new shell, send requests with trace context from a dummy client + ``` + export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger) + export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317 + export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true + export OTEL_SERVICE_NAME="client-service" + python dummy_client.py + ``` + +1. Open Jaeger webui: http://localhost:16686/ + + In the search pane, select `vllm-server` service and hit `Find Traces`. You should get a list of traces, one for each request. + ![Traces](https://i.imgur.com/GYHhFjo.png) + +1. Clicking on a trace will show its spans and their tags. In this demo, each trace has 2 spans. One from the dummy client containing the prompt text and one from vLLM containing metadata about the request. +![Spans details](https://i.imgur.com/OPf6CBL.png) + +## Exporter Protocol +OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter. +By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows: +``` +export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf +export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces +python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" +``` + +## Instrumentation of FastAPI +OpenTelemetry allows automatic instrumentation of FastAPI. +1. Install the instrumentation library + ``` + pip install opentelemetry-instrumentation-fastapi + ``` + +1. Run vLLM with `opentelemetry-instrument` + ``` + opentelemetry-instrument python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" + ``` + +1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI. 
+ +![FastAPI Spans](https://i.imgur.com/hywvoOJ.png) \ No newline at end of file diff --git a/examples/production_monitoring/dummy_client.py b/examples/production_monitoring/dummy_client.py new file mode 100644 index 000000000..b1a2b3c3c --- /dev/null +++ b/examples/production_monitoring/dummy_client.py @@ -0,0 +1,35 @@ +import requests +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( + OTLPSpanExporter) +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import (BatchSpanProcessor, + ConsoleSpanExporter) +from opentelemetry.trace import SpanKind, set_tracer_provider +from opentelemetry.trace.propagation.tracecontext import ( + TraceContextTextMapPropagator) + +trace_provider = TracerProvider() +set_tracer_provider(trace_provider) + +trace_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) +trace_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter())) + +tracer = trace_provider.get_tracer("dummy-client") + +url = "http://localhost:8000/v1/completions" +with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span: + prompt = "San Francisco is a" + span.set_attribute("prompt", prompt) + headers = {} + TraceContextTextMapPropagator().inject(headers) + payload = { + "model": "facebook/opt-125m", + "prompt": prompt, + "max_tokens": 10, + "best_of": 20, + "n": 3, + "use_beam_search": "true", + "temperature": 0.0, + # "stream": True, + } + response = requests.post(url, headers=headers, json=payload) diff --git a/tests/tracing/__init__.py b/tests/tracing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py new file mode 100644 index 000000000..2f8f62cf2 --- /dev/null +++ b/tests/tracing/test_tracing.py @@ -0,0 +1,116 @@ +import os +import threading +from concurrent import futures +from typing import Callable, Dict, Iterable, Literal + +import grpc +import pytest +from 
opentelemetry.proto.collector.trace.v1.trace_service_pb2 import ( + ExportTraceServiceResponse) +from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import ( + TraceServiceServicer, add_TraceServiceServicer_to_server) +from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue +from opentelemetry.sdk.environment_variables import ( + OTEL_EXPORTER_OTLP_TRACES_INSECURE) + +from vllm import LLM, SamplingParams +from vllm.tracing import SpanAttributes + +FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" + +FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value', + 'array_value'] + + +def decode_value(value: AnyValue): + field_decoders: Dict[FieldName, Callable] = { + "bool_value": (lambda v: v.bool_value), + "string_value": (lambda v: v.string_value), + "int_value": (lambda v: v.int_value), + "double_value": (lambda v: v.double_value), + "array_value": + (lambda v: [decode_value(item) for item in v.array_value.values]), + } + for field, decoder in field_decoders.items(): + if value.HasField(field): + return decoder(value) + raise ValueError(f"Couldn't decode value: {value}") + + +def decode_attributes(attributes: Iterable[KeyValue]): + return {kv.key: decode_value(kv.value) for kv in attributes} + + +class FakeTraceService(TraceServiceServicer): + + def __init__(self): + self.request = None + self.evt = threading.Event() + + def Export(self, request, context): + self.request = request + self.evt.set() + return ExportTraceServiceResponse() + + +@pytest.fixture +def trace_service(): + """Fixture to set up a fake gRPC trace service""" + server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) + service = FakeTraceService() + add_TraceServiceServicer_to_server(service, server) + server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS) + server.start() + + yield service + + server.stop(None) + + +def test_traces(trace_service): + os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" + + sampling_params = 
SamplingParams(temperature=0.01, + top_p=0.1, + max_tokens=256) + model = "facebook/opt-125m" + llm = LLM( + model=model, + otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, + ) + prompts = ["This is a short prompt"] + outputs = llm.generate(prompts, sampling_params=sampling_params) + + timeout = 5 + if not trace_service.evt.wait(timeout): + raise TimeoutError( + f"The fake trace service didn't receive a trace within " + f"the {timeout} seconds timeout") + + attributes = decode_attributes(trace_service.request.resource_spans[0]. + scope_spans[0].spans[0].attributes) + assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model + assert attributes.get( + SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id + assert attributes.get( + SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature + assert attributes.get( + SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p + assert attributes.get( + SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens + assert attributes.get( + SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of + assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n + assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len( + outputs[0].prompt_token_ids) + completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) + assert attributes.get( + SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens + metrics = outputs[0].metrics + assert attributes.get( + SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue + ttft = metrics.first_token_time - metrics.arrival_time + assert attributes.get( + SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + e2e_time = metrics.finished_time - metrics.arrival_time + assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time diff --git a/vllm/config.py b/vllm/config.py index d95faf52d..5de00d7d3 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -10,6 +10,7 @@ from transformers import 
PretrainedConfig, PreTrainedTokenizerBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.models import ModelRegistry +from vllm.tracing import is_otel_installed from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu, is_hip, is_neuron, is_tpu, is_xpu) @@ -1371,6 +1372,17 @@ class DecodingConfig: f"must be one of {valid_guided_backends}") +@dataclass +class ObservabilityConfig: + """Configuration for observability.""" + otlp_traces_endpoint: Optional[str] = None + + def __post_init__(self): + if not is_otel_installed() and self.otlp_traces_endpoint is not None: + raise ValueError("OpenTelemetry packages must be installed before " + "configuring 'otlp_traces_endpoint'") + + @dataclass(frozen=True) class EngineConfig: """Dataclass which contains all engine-related configuration. This @@ -1387,6 +1399,7 @@ class EngineConfig: vision_language_config: Optional[VisionLanguageConfig] speculative_config: Optional[SpeculativeConfig] decoding_config: Optional[DecodingConfig] + observability_config: Optional[ObservabilityConfig] def __post_init__(self): """Verify configs are valid & consistent with each other. 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9d04f1dc5..647793a6d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -7,8 +7,9 @@ from typing import List, Optional, Tuple, Union from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, EngineConfig, LoadConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, SpeculativeConfig, - TokenizerPoolConfig, VisionLanguageConfig) + ObservabilityConfig, ParallelConfig, SchedulerConfig, + SpeculativeConfig, TokenizerPoolConfig, + VisionLanguageConfig) from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.utils import str_to_int_tuple @@ -101,6 +102,8 @@ class EngineArgs: qlora_adapter_name_or_path: Optional[str] = None + otlp_traces_endpoint: Optional[str] = None + def __post_init__(self): if self.tokenizer is None: self.tokenizer = self.model @@ -599,6 +602,13 @@ class EngineArgs: type=str, default=None, help='Name or path of the QLoRA adapter.') + + parser.add_argument( + '--otlp-traces-endpoint', + type=str, + default=None, + help='Target URL to which OpenTelemetry traces will be sent.') + return parser @classmethod @@ -757,6 +767,9 @@ class EngineArgs: decoding_config = DecodingConfig( guided_decoding_backend=self.guided_decoding_backend) + observability_config = ObservabilityConfig( + otlp_traces_endpoint=self.otlp_traces_endpoint) + if (model_config.get_sliding_window() is not None and scheduler_config.chunked_prefill_enabled and not scheduler_config.use_v2_block_manager): @@ -764,16 +777,19 @@ class EngineArgs: "Chunked prefill is not supported with sliding window. 
" "Set --disable-sliding-window to disable sliding window.") - return EngineConfig(model_config=model_config, - cache_config=cache_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - device_config=device_config, - lora_config=lora_config, - vision_language_config=vision_language_config, - speculative_config=speculative_config, - load_config=load_config, - decoding_config=decoding_config) + return EngineConfig( + model_config=model_config, + cache_config=cache_config, + parallel_config=parallel_config, + scheduler_config=scheduler_config, + device_config=device_config, + lora_config=lora_config, + vision_language_config=vision_language_config, + speculative_config=speculative_config, + load_config=load_config, + decoding_config=decoding_config, + observability_config=observability_config, + ) @dataclass diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index ab312850b..86720e4fb 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -244,6 +244,9 @@ class _AsyncLLMEngine(LLMEngine): # Log stats. self.do_log_stats(scheduler_outputs, output) + # Tracing + self.do_tracing(scheduler_outputs) + if not request_outputs: # Stop the execute model loop in parallel workers until there are # more requests to process. 
This avoids waiting indefinitely in @@ -285,6 +288,7 @@ class _AsyncLLMEngine(LLMEngine): params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> None: if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " @@ -301,6 +305,7 @@ class _AsyncLLMEngine(LLMEngine): params=params, arrival_time=arrival_time, lora_request=lora_request, + trace_headers=trace_headers, ) async def check_health_async(self) -> None: @@ -556,6 +561,7 @@ class AsyncLLMEngine: params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> AsyncStream: if self.log_requests: if isinstance(inputs, str): @@ -597,6 +603,7 @@ class AsyncLLMEngine: params=params, arrival_time=arrival_time, lora_request=lora_request, + trace_headers=trace_headers, ) return stream @@ -607,6 +614,7 @@ class AsyncLLMEngine: sampling_params: SamplingParams, request_id: str, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> AsyncIterator[RequestOutput]: """Generate outputs for a request. @@ -621,6 +629,7 @@ class AsyncLLMEngine: sampling_params: The sampling parameters of the request. request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. + trace_headers: OpenTelemetry trace headers. 
Yields: The output `RequestOutput` objects from the LLMEngine @@ -674,6 +683,7 @@ class AsyncLLMEngine: inputs, sampling_params, lora_request=lora_request, + trace_headers=trace_headers, ): yield LLMEngine.validate_output(output, RequestOutput) @@ -683,6 +693,7 @@ class AsyncLLMEngine: pooling_params: PoolingParams, request_id: str, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> AsyncIterator[EmbeddingRequestOutput]: """Generate outputs for a request from an embedding model. @@ -697,6 +708,7 @@ class AsyncLLMEngine: pooling_params: The pooling parameters of the request. request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. + trace_headers: OpenTelemetry trace headers. Yields: The output `EmbeddingRequestOutput` objects from the LLMEngine @@ -748,6 +760,7 @@ class AsyncLLMEngine: inputs, pooling_params, lora_request=lora_request, + trace_headers=trace_headers, ): yield LLMEngine.validate_output(output, EmbeddingRequestOutput) @@ -758,6 +771,7 @@ class AsyncLLMEngine: params: Union[SamplingParams, PoolingParams], *, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> AsyncIterator[Union[RequestOutput, EmbeddingRequestOutput]]: """Common logic to process requests with SamplingParams or PoolingParams.""" @@ -769,6 +783,7 @@ class AsyncLLMEngine: params, arrival_time=arrival_time, lora_request=lora_request, + trace_headers=trace_headers, ) try: @@ -848,3 +863,10 @@ class AsyncLLMEngine: else: await self.engine.check_health_async() logger.debug("Health check took %fs", time.perf_counter() - t) + + async def is_tracing_enabled(self) -> bool: + if self.engine_use_ray: + return await self.engine.is_tracing_enabled.remote( # type: ignore + ) + else: + return self.engine.is_tracing_enabled() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index eed9a17e4..75d417f52 100644 --- a/vllm/engine/llm_engine.py +++ 
b/vllm/engine/llm_engine.py @@ -1,14 +1,14 @@ import time from contextlib import contextmanager -from typing import TYPE_CHECKING, ClassVar, Iterable, List, Optional +from typing import TYPE_CHECKING, ClassVar, Dict, Iterable, List, Optional from typing import Sequence as GenericSequence from typing import Set, Type, TypeVar, Union from transformers import GenerationConfig, PreTrainedTokenizer from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig, - LoRAConfig, ModelConfig, ParallelConfig, - SchedulerConfig, SpeculativeConfig, + LoRAConfig, ModelConfig, ObservabilityConfig, + ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler, SchedulerOutputs) @@ -31,6 +31,8 @@ from vllm.sequence import (EmbeddingSequenceGroupOutput, ExecuteModelRequest, PoolerOutput, SamplerOutput, Sequence, SequenceGroup, SequenceGroupMetadata, SequenceStatus) +from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context, + init_tracer) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, get_tokenizer_group) @@ -154,6 +156,7 @@ class LLMEngine: vision_language_config: Optional[VisionLanguageConfig], speculative_config: Optional[SpeculativeConfig], decoding_config: Optional[DecodingConfig], + observability_config: Optional[ObservabilityConfig], executor_class: Type[ExecutorBase], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, @@ -168,7 +171,8 @@ class LLMEngine: "disable_custom_all_reduce=%s, quantization=%s, " "enforce_eager=%s, kv_cache_dtype=%s, " "quantization_param_path=%s, device_config=%s, " - "decoding_config=%r, seed=%d, served_model_name=%s)", + "decoding_config=%r, observability_config=%r, " + "seed=%d, served_model_name=%s)", VLLM_VERSION, model_config.model, speculative_config, @@ -192,6 +196,7 @@ class LLMEngine: 
model_config.quantization_param_path, device_config.device, decoding_config, + observability_config, model_config.seed, model_config.served_model_name, ) @@ -207,6 +212,8 @@ class LLMEngine: self.speculative_config = speculative_config self.load_config = load_config self.decoding_config = decoding_config or DecodingConfig() + self.observability_config = observability_config or ObservabilityConfig( + ) self.log_stats = log_stats if not self.model_config.skip_tokenizer_init: @@ -288,6 +295,12 @@ class LLMEngine: max_model_len=self.model_config.max_model_len) self.stat_logger.info("cache_config", self.cache_config) + self.tracer = None + if self.observability_config.otlp_traces_endpoint: + self.tracer = init_tracer( + "vllm.llm_engine", + self.observability_config.otlp_traces_endpoint) + # Create sequence output processor, e.g. for beam search or # speculative decoding. self.output_processor = ( @@ -444,6 +457,7 @@ class LLMEngine: params: Union[SamplingParams, PoolingParams], arrival_time: float, lora_request: Optional[LoRARequest], + trace_headers: Optional[Dict[str, str]] = None, ) -> None: # Create the sequences. block_size = self.cache_config.block_size @@ -461,6 +475,7 @@ class LLMEngine: params, arrival_time=arrival_time, lora_request=lora_request, + trace_headers=trace_headers, ) elif isinstance(params, PoolingParams): seq_group = self._create_sequence_group_with_pooling( @@ -507,6 +522,7 @@ class LLMEngine: params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> None: """Add a request to the engine's request pool. @@ -524,6 +540,7 @@ class LLMEngine: :class:`~vllm.PoolingParams` for pooling. arrival_time: The arrival time of the request. If None, we use the current monotonic time. + trace_headers: OpenTelemetry trace headers. Details: - Set arrival_time to the current time if it is None. 
@@ -565,6 +582,7 @@ class LLMEngine: params=params, arrival_time=arrival_time, lora_request=lora_request, + trace_headers=trace_headers, ) def _create_sequence_group_with_sampling( @@ -574,6 +592,7 @@ class LLMEngine: sampling_params: SamplingParams, arrival_time: float, lora_request: Optional[LoRARequest], + trace_headers: Optional[Dict[str, str]] = None, ) -> SequenceGroup: """Creates a SequenceGroup with SamplingParams.""" max_logprobs = self.get_model_config().max_logprobs @@ -595,11 +614,14 @@ class LLMEngine: self.generation_config_fields) # Create the sequence group. - seq_group = SequenceGroup(request_id=request_id, - seqs=[seq], - arrival_time=arrival_time, - sampling_params=sampling_params, - lora_request=lora_request) + seq_group = SequenceGroup( + request_id=request_id, + seqs=[seq], + arrival_time=arrival_time, + sampling_params=sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + ) return seq_group @@ -793,6 +815,9 @@ class LLMEngine: # Log stats. self.do_log_stats(scheduler_outputs, output) + # Tracing + self.do_tracing(scheduler_outputs) + if not request_outputs: # Stop the execute model loop in parallel workers until there are # more requests to process. 
This avoids waiting indefinitely in @@ -986,3 +1011,62 @@ class LLMEngine: def check_health(self) -> None: self.model_executor.check_health() + + def is_tracing_enabled(self) -> bool: + return self.tracer is not None + + def do_tracing(self, scheduler_outputs: SchedulerOutputs) -> None: + if self.tracer is None: + return + + for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups: + seq_group = scheduled_seq_group.seq_group + if seq_group.is_finished(): + self.create_trace_span(seq_group) + + def create_trace_span(self, seq_group: SequenceGroup) -> None: + if self.tracer is None or seq_group.sampling_params is None: + return + arrival_time_nano_seconds = int(seq_group.metrics.arrival_time * 1e9) + + trace_context = extract_trace_context(seq_group.trace_headers) + + with self.tracer.start_as_current_span( + "llm_request", + kind=SpanKind.SERVER, + context=trace_context, + start_time=arrival_time_nano_seconds) as seq_span: + metrics = seq_group.metrics + ttft = metrics.first_token_time - metrics.arrival_time + e2e_time = metrics.finished_time - metrics.arrival_time + # attribute names are based on + # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md + seq_span.set_attribute(SpanAttributes.LLM_RESPONSE_MODEL, + self.model_config.model) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_ID, + seq_group.request_id) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TEMPERATURE, + seq_group.sampling_params.temperature) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TOP_P, + seq_group.sampling_params.top_p) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_MAX_TOKENS, + seq_group.sampling_params.max_tokens) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_BEST_OF, + seq_group.sampling_params.best_of) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_N, + seq_group.sampling_params.n) + seq_span.set_attribute(SpanAttributes.LLM_USAGE_NUM_SEQUENCES, + seq_group.num_seqs()) + 
seq_span.set_attribute(SpanAttributes.LLM_USAGE_PROMPT_TOKENS, + len(seq_group.prompt_token_ids)) + seq_span.set_attribute( + SpanAttributes.LLM_USAGE_COMPLETION_TOKENS, + sum([ + seq.get_output_len() + for seq in seq_group.get_finished_seqs() + ])) + seq_span.set_attribute(SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE, + metrics.time_in_queue) + seq_span.set_attribute( + SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN, ttft) + seq_span.set_attribute(SpanAttributes.LLM_LATENCY_E2E, e2e_time) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 769406124..744e1d945 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -31,6 +31,8 @@ from vllm.multimodal.utils import (async_get_and_parse_image, get_full_image_text_prompt) from vllm.outputs import RequestOutput from vllm.sequence import Logprob +from vllm.tracing import (contains_trace_headers, extract_trace_headers, + log_tracing_disabled_warning) from vllm.utils import random_uuid logger = init_logger(__name__) @@ -267,11 +269,20 @@ class OpenAIServingChat(OpenAIServing): if image_data is not None: inputs["multi_modal_data"] = image_data + is_tracing_enabled = await self.engine.is_tracing_enabled() + trace_headers = None + if is_tracing_enabled and raw_request: + trace_headers = extract_trace_headers(raw_request.headers) + if not is_tracing_enabled and raw_request and contains_trace_headers( + raw_request.headers): + log_tracing_disabled_warning() + result_generator = self.engine.generate( inputs, sampling_params, request_id, lora_request, + trace_headers=trace_headers, ) # Streaming response if request.stream: diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 64671e21a..c775fa6da 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -24,6 +24,8 @@ from vllm.model_executor.guided_decoding import ( 
get_guided_decoding_logits_processor) from vllm.outputs import RequestOutput from vllm.sequence import Logprob +from vllm.tracing import (contains_trace_headers, extract_trace_headers, + log_tracing_disabled_warning) from vllm.utils import merge_async_iterators, random_uuid logger = init_logger(__name__) @@ -125,6 +127,14 @@ class OpenAIServingCompletion(OpenAIServing): truncate_prompt_tokens) prompt_ids, prompt_text = prompt_formats + is_tracing_enabled = await self.engine.is_tracing_enabled() + trace_headers = None + if is_tracing_enabled: + trace_headers = extract_trace_headers(raw_request.headers) + if not is_tracing_enabled and contains_trace_headers( + raw_request.headers): + log_tracing_disabled_warning() + generator = self.engine.generate( { "prompt": prompt_text, @@ -133,6 +143,7 @@ class OpenAIServingCompletion(OpenAIServing): sampling_params, f"{request_id}-{i}", lora_request=lora_request, + trace_headers=trace_headers, ) generators.append(generator) diff --git a/vllm/sequence.py b/vllm/sequence.py index 54243bfb1..38d3349f2 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -414,6 +414,7 @@ class SequenceGroup: for an embedding model. encoder_seq: Optional, the single encoder sequence. Should be None unless you are working with an encoder/decoder model. + trace_headers: OpenTelemetry trace headers. 
""" def __init__( @@ -426,6 +427,7 @@ class SequenceGroup: embeddings: Optional[List[float]] = None, pooling_params: Optional[PoolingParams] = None, encoder_seq: Optional[Sequence] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -441,6 +443,7 @@ class SequenceGroup: self.embeddings = embeddings self.pooling_params = pooling_params self.encoder_seq = encoder_seq + self.trace_headers = trace_headers @property def prompt(self) -> Optional[str]: diff --git a/vllm/tracing.py b/vllm/tracing.py new file mode 100644 index 000000000..ba6732cab --- /dev/null +++ b/vllm/tracing.py @@ -0,0 +1,104 @@ +import os +from typing import Mapping, Optional + +from vllm.logger import init_logger +from vllm.utils import run_once + +TRACE_HEADERS = ["traceparent", "tracestate"] + +logger = init_logger(__name__) + +_is_otel_installed = False +try: + from opentelemetry.context.context import Context + from opentelemetry.sdk.environment_variables import ( + OTEL_EXPORTER_OTLP_TRACES_PROTOCOL) + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.semconv.ai import SpanAttributes as BaseSpanAttributes + from opentelemetry.trace import SpanKind, Tracer, set_tracer_provider + from opentelemetry.trace.propagation.tracecontext import ( + TraceContextTextMapPropagator) + _is_otel_installed = True +except ImportError: + + class Context: # type: ignore + pass + + class BaseSpanAttributes: # type: ignore + pass + + class SpanKind: # type: ignore + pass + + class Tracer: # type: ignore + pass + + +def is_otel_installed() -> bool: + return _is_otel_installed + + +def init_tracer(instrumenting_module_name: str, + otlp_traces_endpoint: str) -> Optional[Tracer]: + assert is_otel_installed(), ("OpenTelemetry packages must be installed " + "prior to initializing a tracer") + trace_provider = TracerProvider() + + 
span_exporter = get_span_exporter(otlp_traces_endpoint) + trace_provider.add_span_processor(BatchSpanProcessor(span_exporter)) + set_tracer_provider(trace_provider) + + tracer = trace_provider.get_tracer(instrumenting_module_name) + return tracer + + +def get_span_exporter(endpoint): + protocol = os.environ.get(OTEL_EXPORTER_OTLP_TRACES_PROTOCOL, "grpc") + if protocol == "grpc": + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( + OTLPSpanExporter) + elif protocol == "http/protobuf": + from opentelemetry.exporter.otlp.proto.http.trace_exporter import ( + OTLPSpanExporter) + else: + raise ValueError( + f"Unsupported OTLP protocol '{protocol}' is configured") + + return OTLPSpanExporter(endpoint=endpoint) + + +def extract_trace_context( + headers: Optional[Mapping[str, str]]) -> Optional[Context]: + if is_otel_installed(): + headers = headers or {} + return TraceContextTextMapPropagator().extract(headers) + else: + return None + + +def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]: + + return {h: headers[h] for h in TRACE_HEADERS if h in headers} + + +class SpanAttributes(BaseSpanAttributes): + # The following span attribute names are added here because they are missing + # from the Semantic Conventions for LLM. 
+ LLM_REQUEST_ID = "gen_ai.request.id" + LLM_REQUEST_BEST_OF = "gen_ai.request.best_of" + LLM_REQUEST_N = "gen_ai.request.n" + LLM_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences" + LLM_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue" + LLM_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token" + LLM_LATENCY_E2E = "gen_ai.latency.e2e" + + +def contains_trace_headers(headers: Mapping[str, str]) -> bool: + return any(h in headers for h in TRACE_HEADERS) + + +@run_once +def log_tracing_disabled_warning() -> None: + logger.warning( + "Received a request with trace context but tracing is disabled") diff --git a/vllm/utils.py b/vllm/utils.py index 1adfa9218..ffe921e65 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -763,3 +763,15 @@ def cuda_device_count_stateless() -> int: # after https://github.com/pytorch/pytorch/pull/122815 is released. return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES) + + +#From: https://stackoverflow.com/a/4104188/2749989 +def run_once(f): + + def wrapper(*args, **kwargs) -> Any: + if not wrapper.has_run: # type: ignore[attr-defined] + wrapper.has_run = True # type: ignore[attr-defined] + return f(*args, **kwargs) + + wrapper.has_run = False # type: ignore[attr-defined] + return wrapper -- GitLab From 95db455e7f337e99ffafd0b14367a7cbc11dca43 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Jun 2024 12:45:05 -0400 Subject: [PATCH 086/376] [Misc] Add channel-wise quantization support for w8a8 dynamic per token activation quantization (#5542) --- tests/quantization/test_compressed_tensors.py | 13 +++++-- vllm/model_executor/layers/linear.py | 13 ------- .../compressed_tensors/compressed_tensors.py | 14 ++++--- .../compressed_tensors_w8a8_dynamictoken.py | 37 ++++++++++++++----- 4 files changed, 45 insertions(+), 32 deletions(-) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 611c6b8b7..b78081155 100644 --- 
a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -14,7 +14,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso def test_compressed_tensors_w8a8_static_setup(vllm_runner): - model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2" + model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change" with vllm_runner(model_path, enforce_eager=True) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] @@ -43,15 +43,19 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner): def test_compressed_tensors_no_enforce_eager(vllm_runner): - model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2" + model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change" with vllm_runner(model_path) as llm: sampling_params = SamplingParams() output = llm.generate("Hello world!", sampling_params=sampling_params) assert output -def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner): - model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2" +@pytest.mark.parametrize("model_args", [ + ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"), + ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"), +]) +def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args): + model_path, strategy = model_args with vllm_runner(model_path, dtype=torch.float16) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] @@ -60,6 +64,7 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner): assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken) + assert qkv_proj.scheme.strategy == strategy assert qkv_proj.weight.dtype is torch.int8 diff --git a/vllm/model_executor/layers/linear.py 
b/vllm/model_executor/layers/linear.py index 58c379bcd..45f805547 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -468,13 +468,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear): "MergedColumnParallelLinear, assume the weight is " "the same for all partitions.") - if fp8_scales_shard_indexer is None: - if len(param_data.shape) == 0: - param_data = param_data.reshape(1) - - if len(loaded_weight.shape) == 0: - loaded_weight = loaded_weight.reshape(1) - assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -686,12 +679,6 @@ class QKVParallelLinear(ColumnParallelLinear): "QKVParallelLinear, assume the weight is the same " "for all partitions.") - if len(param_data.shape) == 0: - param_data = param_data.reshape(1) - - if len(loaded_weight.shape) == 0: - loaded_weight = loaded_weight.reshape(1) - assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 92a84b3c0..347a052a6 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -95,14 +95,15 @@ class CompressedTensorsConfig(QuantizationConfig): def _is_dynamic_token_w8a8(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 - is_token_tensor = (weight_quant.strategy - == QuantizationStrategy.TENSOR.value) and ( - input_quant.strategy - == QuantizationStrategy.TOKEN.value) + weight_strategy = ( + weight_quant.strategy == QuantizationStrategy.TENSOR.value + or weight_quant.strategy == QuantizationStrategy.CHANNEL.value) + is_token = (weight_strategy and input_quant.strategy + == QuantizationStrategy.TOKEN.value) is_symmetric = 
weight_quant.symmetric and input_quant.symmetric is_dynamic = not weight_quant.dynamic and input_quant.dynamic - return is_8_bits and is_token_tensor and is_symmetric and is_dynamic + return is_8_bits and is_token and is_symmetric and is_dynamic def _is_w4a16(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: @@ -133,7 +134,8 @@ class CompressedTensorsConfig(QuantizationConfig): return CompressedTensorsW8A8StaticTensor() if self._is_dynamic_token_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8DynamicToken() + return CompressedTensorsW8A8DynamicToken( + strategy=weight_quant.strategy) raise NotImplementedError( "No compressed-tensors compatible scheme was found.") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py index d514d7b28..37610c9c2 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py @@ -6,6 +6,8 @@ from torch.nn import Parameter from vllm import _custom_ops as custom_ops from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + QuantizationStrategy) from vllm.model_executor.utils import set_weight_attrs __all__ = ["CompressedTensorsW8A8DynamicToken"] @@ -13,6 +15,9 @@ __all__ = ["CompressedTensorsW8A8DynamicToken"] class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme): + def __init__(self, strategy: str): + self.strategy = strategy + def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: if isinstance(shard_id, int): return shard_id @@ -45,11 +50,17 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme): # 
CompressedTensorsW8A8StaticTensor::create_weights for further # information. is_tensor_partitioned = len(output_partition_sizes) != 1 - weight_scale_dim = sum( - output_partition_sizes) if is_tensor_partitioned else 1 + # when doing channel-wise quantization, number of scales + # is equal to output_dim + weight_scale_dim = sum(output_partition_sizes) if ( + is_tensor_partitioned + or self.strategy == QuantizationStrategy.CHANNEL) else 1 + + shape: Union[Tuple[int], Tuple[int, int]] = (weight_scale_dim, ) + if self.strategy == QuantizationStrategy.CHANNEL: + shape = (weight_scale_dim, 1) - weight_scale = Parameter(torch.empty(weight_scale_dim, - dtype=torch.float32), + weight_scale = Parameter(torch.empty(*shape, dtype=torch.float32), requires_grad=False) weight = Parameter(torch.empty(sum(output_partition_sizes), @@ -67,12 +78,20 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme): }) layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs( - weight_scale, { - "weight_loader": weight_loader, - "shard_splitter": self.scales_shard_splitter, - "logical_widths": output_partition_sizes + set_weight_attrs(weight_scale, {"weight_loader": weight_loader}) + + # Don't need a shard_splitter for channel-wise quantization + # Use the default loading method + if self.strategy == QuantizationStrategy.CHANNEL: + set_weight_attrs(weight_scale, { + "output_dim": 0, }) + else: + set_weight_attrs( + weight_scale, { + "logical_widths": output_partition_sizes, + "shard_splitter": self.scales_shard_splitter, + }) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): weight = layer.weight -- GitLab From 19091efc44c6f9b1e008dc5469c63a1f01684745 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Tue, 18 Jun 2024 11:00:36 -0700 Subject: [PATCH 087/376] [ci] Setup Release pipeline and build release wheels with cache (#5610) Signed-off-by: kevin --- .buildkite/release-pipeline.yaml | 21 ++++++++++++ Dockerfile | 58 ++++++++++++++++++++++---------- 2 files changed, 62 insertions(+), 17 deletions(-) create mode 100644 .buildkite/release-pipeline.yaml diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml new file mode 100644 index 000000000..1959f9752 --- /dev/null +++ b/.buildkite/release-pipeline.yaml @@ -0,0 +1,21 @@ +steps: + - block: "Build wheels" + + - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}" + agents: + queue: cpu_queue + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host" + - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/" + matrix: + setup: + cuda_version: + - "11.8.0" + - "12.1.0" + python_version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" diff --git a/Dockerfile b/Dockerfile index 72894e7ca..5b3e682a8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,9 +5,26 @@ # docs/source/dev/dockerfile/dockerfile.rst and # docs/source/assets/dev/dockerfile-stages-dependency.png +ARG CUDA_VERSION=12.4.1 #################### BASE BUILD IMAGE #################### # prepare basic build environment -FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base + +ARG CUDA_VERSION=12.4.1 +ARG PYTHON_VERSION=3 + +ENV DEBIAN_FRONTEND=noninteractive + +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | 
debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \ + && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \ + && python3 --version \ + && python3 -m pip --version RUN apt-get update -y \ && apt-get install -y python3-pip git curl sudo @@ -16,7 +33,7 @@ RUN apt-get update -y \ # https://github.com/pytorch/pytorch/issues/107960 -- hopefully # this won't be needed for future versions of this docker image # or future versions of triton. -RUN ldconfig /usr/local/cuda-12.4/compat/ +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ WORKDIR /workspace @@ -24,14 +41,7 @@ WORKDIR /workspace COPY requirements-common.txt requirements-common.txt COPY requirements-cuda.txt requirements-cuda.txt RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements-cuda.txt - -# install development dependencies -COPY requirements-lint.txt requirements-lint.txt -COPY requirements-test.txt requirements-test.txt -COPY requirements-dev.txt requirements-dev.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements-dev.txt + python3 -m pip install -r requirements-cuda.txt # cuda arch list used by torch # can be useful for both `dev` and `test` @@ -41,14 +51,16 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX' ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} #################### BASE BUILD IMAGE #################### - #################### WHEEL BUILD IMAGE #################### -FROM dev AS build +FROM base AS build + +ARG PYTHON_VERSION=3 # install build dependencies COPY requirements-build.txt requirements-build.txt + RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r 
requirements-build.txt + python3 -m pip install -r requirements-build.txt # install compiler cache to speed up compilation leveraging local or remote caching RUN apt-get update -y && apt-get install -y ccache @@ -101,9 +113,21 @@ RUN python3 check-wheel-size.py dist #################### EXTENSION Build IMAGE #################### +#################### DEV IMAGE #################### +FROM base as dev + +COPY requirements-lint.txt requirements-lint.txt +COPY requirements-test.txt requirements-test.txt +COPY requirements-dev.txt requirements-dev.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -r requirements-dev.txt + +#################### DEV IMAGE #################### + #################### vLLM installation IMAGE #################### # image with vLLM installed -FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base +ARG CUDA_VERSION=12.4.1 WORKDIR /vllm-workspace RUN apt-get update -y \ @@ -113,12 +137,12 @@ RUN apt-get update -y \ # https://github.com/pytorch/pytorch/issues/107960 -- hopefully # this won't be needed for future versions of this docker image # or future versions of triton. -RUN ldconfig /usr/local/cuda-12.4/compat/ +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ # install vllm wheel first, so that torch etc will be installed RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ - pip install dist/*.whl --verbose + python3 -m pip install dist/*.whl --verbose #################### vLLM installation IMAGE #################### @@ -131,7 +155,7 @@ ADD . 
/vllm-workspace/ # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements-dev.txt + python3 -m pip install -r requirements-dev.txt # doc requires source code # we hide them inside `test_docs/` , so that this source code -- GitLab From 07feecde1a69859d565786a7ad64c0f604f17b28 Mon Sep 17 00:00:00 2001 From: sergey-tinkoff <167607910+sergey-tinkoff@users.noreply.github.com> Date: Tue, 18 Jun 2024 21:01:21 +0300 Subject: [PATCH 088/376] [Model] LoRA support added for command-r (#5178) --- csrc/punica/bgmv/bgmv_config.h | 6 ++++ tests/lora/test_punica.py | 2 ++ vllm/model_executor/models/commandr.py | 48 ++++++++++++++++++++++---- 3 files changed, 50 insertions(+), 6 deletions(-) mode change 100644 => 100755 csrc/punica/bgmv/bgmv_config.h diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h old mode 100644 new mode 100755 index 0456b4bc2..c38db2dcd --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -69,6 +69,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 36864) \ f(in_T, out_T, W_T, narrow, 43264) \ f(in_T, out_T, W_T, narrow, 49152) \ + f(in_T, out_T, W_T, narrow, 60544) \ + f(in_T, out_T, W_T, narrow, 60672) \ f(in_T, out_T, W_T, narrow, 64000) \ f(in_T, out_T, W_T, narrow, 64256) \ f(in_T, out_T, W_T, narrow, 64512) \ @@ -78,6 +80,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 128000) \ f(in_T, out_T, W_T, narrow, 128256) \ f(in_T, out_T, W_T, narrow, 128512) \ + + // Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA // and vllm/tests/lora/test_punica.py @@ -144,6 +148,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 36864, narrow) \ f(in_T, out_T, W_T, 43264, narrow) \ f(in_T, out_T, W_T, 49152, narrow) \ + f(in_T, out_T, W_T, 60544, narrow) \ + f(in_T, out_T, W_T, 60672, narrow) \ 
f(in_T, out_T, W_T, 64000, narrow) \ f(in_T, out_T, W_T, 64256, narrow) \ f(in_T, out_T, W_T, 64512, narrow) \ diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index d87658e5d..dae1d5687 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -94,6 +94,8 @@ H1 = H2 = [ 36864, 43264, 49152, + 60544, + 60672, 64000, 64256, 102400, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 11d88d45e..600c2990b 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -29,7 +29,7 @@ from torch.nn.parameter import Parameter from transformers import CohereConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -265,10 +265,14 @@ class CohereModel(nn.Module): config: CohereConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, ): super().__init__() self.config = config - self.vocab_size = config.vocab_size + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding(config.vocab_size, config.hidden_size) self.layers = nn.ModuleList([ @@ -302,18 +306,44 @@ class CohereModel(nn.Module): class CohereForCausalLM(nn.Module): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens" + ] + embedding_modules = {"embed_tokens": 
"input_embeddings"} + embedding_padding_modules = [] + def __init__( self, config: CohereConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() self.config = config + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.quant_config = quant_config - self.logits_processor = LogitsProcessor(config.vocab_size, + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, scale=config.logit_scale) - self.model = CohereModel(config, cache_config, quant_config) + self.model = CohereModel(config, + cache_config, + quant_config, + lora_config=lora_config) self.sampler = Sampler() @torch.no_grad() @@ -330,8 +360,14 @@ class CohereForCausalLM(nn.Module): def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.model.embed_tokens.weight, - hidden_states, sampling_metadata) + is_not_lora = hasattr(self.model.embed_tokens, 'weight') + if is_not_lora: + embedding_weights = self.model.embed_tokens.weight + else: + embedding_weights = self.model.embed_tokens.base_layer.weight + + logits = self.logits_processor(embedding_weights, hidden_states, + sampling_metadata) return logits def sample( -- GitLab From 8a173382c80d6730e1bbc81f932ac3721ab2cd9d Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Tue, 18 Jun 2024 23:18:37 +0200 Subject: [PATCH 089/376] [Bugfix] Fix for inconsistent behaviour related to sampling and repetition penalties (#5639) Signed-off-by: Thomas Parnell --- vllm/model_executor/sampling_metadata.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 7ad84f51b..f95de56f3 100644 --- a/vllm/model_executor/sampling_metadata.py +++ 
b/vllm/model_executor/sampling_metadata.py @@ -386,18 +386,10 @@ class SamplingTensors: presence_penalties += [0] * prefill_len frequency_penalties += [0] * prefill_len repetition_penalties += [1] * prefill_len - if do_penalties: - prompt_tokens.extend([] for _ in range(prefill_len)) - output_tokens.extend([] for _ in range(prefill_len)) if seq_group.do_sample: sample_lens = len(seq_group.sample_indices) assert sample_lens == len(seq_ids) - for seq_id in seq_ids: - seq_data = seq_group.seq_data[seq_id] - if do_penalties: - prompt_tokens.append(seq_data.prompt_token_ids) - output_tokens.append(seq_data.output_token_ids) temperatures += [temperature] * len(seq_ids) top_ps += [top_p] * len(seq_ids) top_ks += [top_k] * len(seq_ids) @@ -424,6 +416,20 @@ class SamplingTensors: sampling_seeds.append(seq_seeds) sample_indices.extend(seq_group.sample_indices) + if do_penalties: + for seq_group in sampling_metadata.seq_groups: + seq_ids = seq_group.seq_ids + if (seq_group.is_prompt + and sampling_params.prompt_logprobs is not None): + prefill_len = len(seq_group.prompt_logprob_indices) + prompt_tokens.extend([] for _ in range(prefill_len)) + output_tokens.extend([] for _ in range(prefill_len)) + if seq_group.do_sample: + for seq_id in seq_ids: + seq_data = seq_group.seq_data[seq_id] + prompt_tokens.append(seq_data.prompt_token_ids) + output_tokens.append(seq_data.output_token_ids) + sampling_tensors = SamplingTensors.from_lists( temperatures, top_ps, top_ks, min_ps, presence_penalties, frequency_penalties, repetition_penalties, sampling_seeds, -- GitLab From 2bd231a7b7787407ccba36f966603578842d03f7 Mon Sep 17 00:00:00 2001 From: milo157 <43028253+milo157@users.noreply.github.com> Date: Tue, 18 Jun 2024 18:56:59 -0400 Subject: [PATCH 090/376] [Doc] Added cerebrium as Integration option (#5553) --- .../serving/deploying_with_cerebrium.rst | 109 ++++++++++++++++++ docs/source/serving/integrations.rst | 1 + 2 files changed, 110 insertions(+) create mode 100644 
docs/source/serving/deploying_with_cerebrium.rst diff --git a/docs/source/serving/deploying_with_cerebrium.rst b/docs/source/serving/deploying_with_cerebrium.rst new file mode 100644 index 000000000..ff0ac9111 --- /dev/null +++ b/docs/source/serving/deploying_with_cerebrium.rst @@ -0,0 +1,109 @@ +.. _deploying_with_cerebrium: + +Deploying with Cerebrium +============================ + +.. raw:: html + +

+ vLLM_plus_cerebrium +

+ +vLLM can be run on a cloud-based GPU machine with `Cerebrium <https://www.cerebrium.ai/>`__, a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI-based applications. + +To install the Cerebrium client, run: + +.. code-block:: console + + $ pip install cerebrium + $ cerebrium login + +Next, to create your Cerebrium project, run: + +.. code-block:: console + + $ cerebrium init vllm-project + +Next, to install the required packages, add the following to your cerebrium.toml: + +.. code-block:: toml + + [cerebrium.dependencies.pip] + vllm = "latest" + +Next, let us add the code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` in this example). Add the following code to your `main.py`: + +.. code-block:: python + + from vllm import LLM, SamplingParams + + llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1") + + def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): + + sampling_params = SamplingParams(temperature=temperature, top_p=top_p) + outputs = llm.generate(prompts, sampling_params) + + # Collect the outputs. + results = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + results.append({"prompt": prompt, "generated_text": generated_text}) + + return {"results": results} + + +Then, run the following command to deploy it to the cloud: + +.. code-block:: console + + $ cerebrium deploy + +If the deployment is successful, you will be given a curl command that you can use to run inference. Just remember to end the URL with the name of the function you are calling (in our case, /run): + +.. code-block:: console + + curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ + -H 'Content-Type: application/json' \ + -H 'Authorization: <YOUR_TOKEN>' \ + --data '{ + "prompts": [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is" + ] + }' + +You should get a response like: + +.. 
code-block:: json + + { + "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", + "result": { + "results": [ + { + "prompt": "Hello, my name is", + "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of" + }, + { + "prompt": "The president of the United States is", + "generated_text": " elected every four years. This is a democratic system.\n\n5. What" + }, + { + "prompt": "The capital of France is", + "generated_text": " Paris.\n" + }, + { + "prompt": "The future of AI is", + "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective." + } + ] + }, + "run_time_ms": 152.53663063049316 + } + +You now have an autoscaling endpoint where you only pay for the compute you use! + diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst index 83a8b5a88..680ea523d 100644 --- a/docs/source/serving/integrations.rst +++ b/docs/source/serving/integrations.rst @@ -8,6 +8,7 @@ Integrations deploying_with_kserve deploying_with_triton deploying_with_bentoml + deploying_with_cerebrium deploying_with_lws deploying_with_dstack serving_with_langchain -- GitLab From b23ce9203235488e080434108d3504d54b24e867 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 18 Jun 2024 19:48:49 -0400 Subject: [PATCH 091/376] [Bugfix] Fix CUDA version check for mma warning suppression (#5642) --- csrc/quantization/marlin/sparse/common/mma.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index 8a6c65338..b26505f77 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -17,6 +17,7 @@ #pragma once #include "base.h" +#include namespace marlin_24 { @@ -26,7 +27,7 @@ namespace marlin_24 { // | Advisory: Modifier ‘.sp::ordered_metadata’ should be used on instruction // | ‘mma’ instead of modifier ‘.sp’ as it is expected to have 
substantially // | reduced performance on some future architectures -#if defined CUDA_VERSION && CUDA_VERSION >= 12500 +#if defined CUDA_VERSION && CUDA_VERSION >= 12050 #define MMA_SP_INST \ "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " #else -- GitLab From 6820724e51079120251c8522afd385ca64abc948 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 18 Jun 2024 20:33:25 -0400 Subject: [PATCH 092/376] [Bugfix] Fix w8a8 benchmarks for int8 case (#5643) --- benchmarks/cutlass_benchmarks/w8a8_benchmarks.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 523e970c2..5cc0fbbd4 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -120,9 +120,8 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, # cutlass impl timers.append( - bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), - torch.bfloat16, label, sub_label, cutlass_impl, - "cutlass_i8_i8_bf16_scaled_mm")) + bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, + cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm")) return timers -- GitLab From 59a1eb59c9cb383e5ea36d7253f81ff2ea7766cc Mon Sep 17 00:00:00 2001 From: Shukant Pal Date: Tue, 18 Jun 2024 18:46:38 -0700 Subject: [PATCH 093/376] [Bugfix] Fix Phi-3 Long RoPE scaling implementation (#5628) --- vllm/model_executor/layers/rotary_embedding.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 9c0a74cda..a0b19046b 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -507,8 +507,8 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): dtype: torch.dtype, short_factor: List[float], long_factor: 
List[float], - short_mscale: float = 1.1, - long_mscale: float = 1.225, + short_mscale: float = 1.0, + long_mscale: float = 1.0, ): super().__init__() @@ -530,6 +530,16 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): self.short_mscale = short_mscale self.long_mscale = long_mscale + scale = (self.max_position_embeddings / + self.original_max_position_embeddings) + + if scale <= 1.0: + self.scaling_factor = 1.0 + else: + self.scaling_factor = math.sqrt( + 1 + math.log(scale) / + math.log(self.original_max_position_embeddings)) + short_cache = self._compute_cos_sin_cache( original_max_position_embeddings, short_factor, short_mscale) short_cache = short_cache.to(dtype) @@ -565,8 +575,8 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): inv_freq = self._compute_inv_freq(rescale_factors) t = torch.arange(max_position_embeddings, dtype=torch.float) freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() * mscale - sin = freqs.sin() * mscale + cos = freqs.cos() * mscale * self.scaling_factor + sin = freqs.sin() * mscale * self.scaling_factor cache = torch.cat((cos, sin), dim=-1) return cache -- GitLab From e5150f2c281f052df42121ae60827156abe57173 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 19 Jun 2024 08:03:55 +0200 Subject: [PATCH 094/376] [Bugfix] Added test for sampling repetition penalty bug. 
(#5659) Signed-off-by: Thomas Parnell --- tests/samplers/test_sampler.py | 69 ++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index c6ef4358e..0aabde6aa 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -631,3 +631,72 @@ def test_sampler_top_k_top_p(seed: int, device: str): hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) assert torch.allclose(hf_probs, sample_probs, atol=1e-5) assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) + + +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_repetition_penalty_mixed(device: str): + + vocab_size = 8 + + def test_sampling_params(sampling_params: List[SamplingParams]): + + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] + for i in range(2): + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=True, + seq_data={0: SequenceData([1, 2, 3])}, + sampling_params=sampling_params[i], + block_tables={0: [1]}, + )) + seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens=seq_lens, + device=device, + pin_memory=is_pin_memory_available()) + + fake_logits = torch.full((2, vocab_size), + 1e-2, + device=device, + dtype=torch.float16) + + fake_logits[:, 5] = 1.1e-2 + fake_logits[:, 1] = 1.2e-2 + + sampler = MockLogitsSampler(fake_logits) + + sampler_output = sampler(logits=fake_logits, + sampling_metadata=sampling_metadata) + + generated_tokens = [] + for output in sampler_output: + generated_tokens.append(output.samples[0].output_token) + + return generated_tokens + + # one configuration is greedy with repetition_penalty + sampling_params_rep = SamplingParams( + temperature=0.0, + repetition_penalty=2.0, + ) + + # other configuration is sampling w/o repetition_penalty + 
sampling_params_sample = SamplingParams( + temperature=1.0, + top_k=1, + seed=42, + ) + + tokens1 = test_sampling_params( + [sampling_params_rep, sampling_params_sample]) + + tokens2 = test_sampling_params( + [sampling_params_sample, sampling_params_rep]) + + assert tokens1[0] == tokens2[1] + assert tokens1[1] == tokens2[0] -- GitLab From f758aed0e851687e919a4ee09ab872ee2c8fe159 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Wed, 19 Jun 2024 02:21:29 -0400 Subject: [PATCH 095/376] [Bugfix][CI/Build][AMD][ROCm]Fixed the cmake build bug which generate garbage on certain devices (#5641) --- Dockerfile.rocm | 17 ++++++++--------- cmake/utils.cmake | 5 ++++- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 724fa1673..6bda69685 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -7,9 +7,8 @@ ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" RUN echo "Base image is $BASE_IMAGE" -# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" -# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - +ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \ + ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ARG FA_GFX_ARCHS="gfx90a;gfx942" RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" @@ -68,7 +67,7 @@ RUN if [ "$BUILD_FA" = "1" ]; then \ && git checkout ${FA_BRANCH} \ && git submodule update --init \ && export GPU_ARCHS=${FA_GFX_ARCHS} \ - && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \ + && if [ "$BASE_IMAGE" = "$ROCm_5_7_BASE" ]; then \ patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ && python3 setup.py install \ && cd ..; \ @@ -76,7 +75,7 @@ RUN if [ "$BUILD_FA" = "1" ]; then \ # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, 
but an extra LICENSES_bundled.txt. # Manually removed it so that later steps of numpy upgrade can continue -RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ +RUN if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi # build triton @@ -107,11 +106,11 @@ ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/pip \ pip install -U -r requirements-rocm.txt \ - && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \ + && if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ + patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch; fi \ && python3 setup.py install \ - && cp build/lib.linux-x86_64-cpython-39/vllm/_C.abi3.so vllm/ \ - && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.abi3.so vllm/ \ - && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.abi3.so vllm/ \ + && export VLLM_PYTHON_VERSION=$(python -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))") \ + && cp build/lib.linux-x86_64-cpython-${VLLM_PYTHON_VERSION}/vllm/*.so vllm/ \ && cd .. diff --git a/cmake/utils.cmake b/cmake/utils.cmake index f3c1286dd..071e16336 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -155,8 +155,11 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) # Find the intersection of the supported + detected architectures to # set the module architecture flags. 
# + + set(VLLM_ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") + set(${GPU_ARCHES}) - foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES}) + foreach (_ARCH ${VLLM_ROCM_SUPPORTED_ARCHS}) if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) list(APPEND ${GPU_ARCHES} ${_ARCH}) endif() -- GitLab From 3eea74889fe29534808bae41fca251e0e74c0962 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 19 Jun 2024 01:05:00 -0700 Subject: [PATCH 096/376] [misc][distributed] use 127.0.0.1 for single-node (#5619) --- vllm/executor/multiproc_gpu_executor.py | 7 +++++-- vllm/executor/ray_gpu_executor.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 8385e56f8..e63e5a3a0 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -10,7 +10,7 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (cuda_device_count_stateless, - get_distributed_init_method, get_ip, get_open_port, + get_distributed_init_method, get_open_port, get_vllm_instance_id, make_async) logger = init_logger(__name__) @@ -37,8 +37,11 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor): assert world_size <= cuda_device_count_stateless(), ( "please set tensor_parallel_size to less than max local gpu count") + # Multiprocessing-based executor does not support multi-node setting. + # Since it only works for single node, we can use the loopback address + # 127.0.0.1 for communication. 
distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) + "127.0.0.1", get_open_port()) if world_size == 1: self.workers = [] diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 843332e5e..fc83c5528 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -161,6 +161,16 @@ class RayGPUExecutor(DistributedGPUExecutor): self._run_workers("update_environment_variables", all_args=all_args_to_update_environment_variables) + if len(node_gpus) == 1: + # in single node case, we don't need to get the IP address. + # the loopback address is sufficient + # NOTE: a node may have several IP addresses, one for each + # network interface. `get_ip()` might return any of them, + # while they might not work for communication inside the node + # if the network setup is complicated. Using the loopback address + # solves this issue, as it always works for communication inside + # the node. + driver_ip = "127.0.0.1" distributed_init_method = get_distributed_init_method( driver_ip, get_open_port()) -- GitLab From da971ec7a5b35f33981cff9ca50064d3166953f9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 19 Jun 2024 05:38:26 -0400 Subject: [PATCH 097/376] [Model] Add FP8 kv cache for Qwen2 (#5656) --- vllm/model_executor/models/qwen2.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 9a4829a27..b5d13bb6b 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -46,6 +46,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput +from vllm.utils import print_warning_once class Qwen2MLP(nn.Module): @@ -375,6 +376,19 @@ class Qwen2ForCausalLM(nn.Module): # Skip 
loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Remapping the name of FP8 kv-scale. + if name.endswith("kv_scale"): + remapped_kv_scale_name = name.replace( + ".kv_scale", ".attn.kv_scale") + if remapped_kv_scale_name not in params_dict: + print_warning_once( + f"Found kv scale in the checkpoint (e.g. {name}), " + "but not found the expected name in the model " + f"(e.g. {remapped_kv_scale_name}). kv-scale is " + "not loaded.") + continue + else: + name = remapped_kv_scale_name param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) -- GitLab From 7d46c8d37864993162bbeb61dc19b5ad6043646d Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 19 Jun 2024 17:58:32 +0800 Subject: [PATCH 098/376] [Bugfix] Fix sampling_params passed incorrectly in Phi3v example (#5684) --- examples/phi3v_example.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py index d5e60ae1e..4f37c47dd 100644 --- a/examples/phi3v_example.py +++ b/examples/phi3v_example.py @@ -12,7 +12,6 @@ def run_phi3v(): llm = LLM( model=model_path, trust_remote_code=True, - max_model_len=4096, image_input_type="pixel_values", image_token_id=32044, image_input_shape="1,3,1008,1344", @@ -28,11 +27,12 @@ def run_phi3v(): sampling_params = SamplingParams(temperature=0, max_tokens=64) - outputs = llm.generate({ - "prompt": prompt, - "sampling_params": sampling_params, - "multi_modal_data": ImagePixelData(image), - }) + outputs = llm.generate( + { + "prompt": prompt, + "multi_modal_data": ImagePixelData(image), + }, + sampling_params=sampling_params) for o in outputs: generated_text = o.outputs[0].text print(generated_text) -- GitLab From d8714530d11603a159a46ea0dde299f95807cfde Mon Sep 17 00:00:00 2001 From: DearPlanet Date: Wed, 19 Jun 2024 18:19:08 +0800 Subject: [PATCH 099/376] [Misc]Add param max-model-len in 
benchmark_latency.py (#5629) --- benchmarks/benchmark_latency.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 98e0be277..e9d1048c8 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -29,6 +29,7 @@ def main(args: argparse.Namespace): tensor_parallel_size=args.tensor_parallel_size, trust_remote_code=args.trust_remote_code, dtype=args.dtype, + max_model_len=args.max_model_len, enforce_eager=args.enforce_eager, kv_cache_dtype=args.kv_cache_dtype, quantization_param_path=args.quantization_param_path, @@ -150,6 +151,12 @@ if __name__ == '__main__': parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface') + parser.add_argument( + '--max-model-len', + type=int, + default=None, + help='Maximum length of a sequence (including prompt and output). ' + 'If None, will be derived from the model.') parser.add_argument( '--dtype', type=str, -- GitLab From e9c2732b976612b6362635be2984f03bfabc20ec Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 19 Jun 2024 22:37:33 +0800 Subject: [PATCH 100/376] [CI/Build] Add tqdm to dependencies (#5680) --- requirements-common.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-common.txt b/requirements-common.txt index 32e2ebe8c..05969cfa5 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -4,6 +4,7 @@ psutil sentencepiece # Required for LLaMA tokenizer. numpy < 2.0.0 requests +tqdm py-cpuinfo transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3. tokenizers >= 0.19.1 # Required for Llama 3. -- GitLab From 3ee5c4bca514ee95592a018fae95e050fd6763c0 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Wed, 19 Jun 2024 07:42:13 -0700 Subject: [PATCH 101/376] [ci] Add A100 queue into AWS CI template (#5648) Signed-off-by: kevin --- .../benchmark-pipeline.yaml | 1 + .buildkite/test-pipeline.yaml | 5 ++ .buildkite/test-template-aws.j2 | 46 +++++++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 8f12748b6..2b25c954b 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -17,6 +17,7 @@ steps: plugins: - kubernetes: podSpec: + priorityClassName: perf-benchmark containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT command: diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5afe37302..c2160fee3 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -181,3 +181,8 @@ steps: commands: - pip install -r requirements-docs.txt - SPHINXOPTS=\"-W\" make html + +- label: A100 status + gpu: a100 + commands: + - nvidia-smi diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index 01f7ff1e0..08146bf44 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -49,6 +49,51 @@ steps: command: bash .buildkite/run-cpu-test.sh {% for step in steps %} + {% if step.gpu == "a100" %} + - label: "{{ step.label }}" + agents: + queue: a100-queue + soft_fail: {{ step.soft_fail or false }} + {% if step.parallelism %} + parallelism: {{ step.parallelism }} + {% endif %} + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 5 + - exit_status: -10 # Agent was lost + limit: 5 + plugins: + - kubernetes: + podSpec: + priorityClassName: ci + containers: + - image: {{ docker_image }} + command: ["bash"] + args: + - '-c' + - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" + 
resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + {% else %} - label: "{{ step.label }}" agents: {% if step.label == "Documentation Build" %} @@ -90,4 +135,5 @@ steps: {% endif %} volumes: - /dev/shm:/dev/shm + {% endif %} {% endfor %} -- GitLab From afed90a0344b1b0ce6aae46efc630adb489ec769 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 19 Jun 2024 14:41:42 -0400 Subject: [PATCH 102/376] [Frontend][Bugfix] Fix preemption_mode -> preemption-mode for CLI arg in arg_utils.py (#5688) --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 647793a6d..7f760c277 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -577,7 +577,7 @@ class EngineArgs: 'This should be a JSON string that will be ' 'parsed into a dictionary.') parser.add_argument( - '--preemption_mode', + '--preemption-mode', type=str, default=None, help='If \'recompute\', the engine performs preemption by block ' -- GitLab From d571ca010813c5532c646ee74c1a2c9e1e78e12a Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 19 Jun 2024 13:16:04 -0700 Subject: [PATCH 103/376] [ci][distributed] add tests for custom allreduce (#5689) --- .buildkite/test-pipeline.yaml | 8 ++++++-- tests/distributed/test_custom_all_reduce.py | 7 ++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c2160fee3..b1602dd94 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -182,7 +182,11 @@ steps: - pip install -r requirements-docs.txt - SPHINXOPTS=\"-W\" make html -- label: A100 status +- label: 
Distributed Tests (A100) gpu: a100 commands: - - nvidia-smi + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 3776c1f91..9a39160b8 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -11,7 +11,8 @@ from vllm.distributed.communication_op import ( # noqa from vllm.distributed.parallel_state import (get_tensor_model_parallel_group, get_tp_group, graph_capture) -from ..utils import (init_test_distributed_environment, +from ..utils import (ensure_model_parallel_initialized, + init_test_distributed_environment, multi_process_tensor_parallel) random.seed(42) @@ -27,8 +28,8 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) - - group = get_tensor_model_parallel_group() + ensure_model_parallel_initialized(tp_size, pp_size) + group = get_tensor_model_parallel_group().device_group # A small all_reduce for warmup. 
# this is needed because device communicators might be created lazily -- GitLab From 78687504f7eb6d7523bff15b1bca8c9cbb74656a Mon Sep 17 00:00:00 2001 From: zifeitong Date: Wed, 19 Jun 2024 13:57:12 -0700 Subject: [PATCH 104/376] [Bugfix] AsyncLLMEngine hangs with asyncio.run (#5654) --- tests/async_engine/test_async_llm_engine.py | 38 +++- tests/spec_decode/e2e/conftest.py | 43 +---- tests/utils.py | 43 ++++- vllm/engine/async_llm_engine.py | 5 +- vllm/engine/async_timeout.py | 189 ++++++++++++++++++++ 5 files changed, 271 insertions(+), 47 deletions(-) create mode 100644 vllm/engine/async_timeout.py diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 10a464228..52d3394a9 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -2,8 +2,12 @@ import asyncio from dataclasses import dataclass import pytest +import torch -from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm import SamplingParams +from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine + +from ..utils import wait_for_gpu_memory_to_clear @dataclass @@ -94,3 +98,35 @@ async def test_new_requests_event(): assert engine.get_model_config() is not None assert engine.get_tokenizer() is not None assert engine.get_decoding_config() is not None + + +def test_asyncio_run(): + wait_for_gpu_memory_to_clear( + devices=list(range(torch.cuda.device_count())), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) + + engine = AsyncLLMEngine.from_engine_args( + AsyncEngineArgs(model="facebook/opt-125m")) + + async def run(prompt: str): + sampling_params = SamplingParams( + temperature=0, + max_tokens=32, + ) + + async for output in engine.generate(prompt, + sampling_params, + request_id=prompt): + final_output = output + return final_output + + async def generate(): + return await asyncio.gather( + run("test0"), + run("test1"), + ) + + results = asyncio.run(generate()) + assert 
len(results) == 2 diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index 86103cf85..60dfe33f2 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -1,5 +1,4 @@ import asyncio -import time from itertools import cycle from typing import Dict, List, Optional, Tuple, Union @@ -7,12 +6,6 @@ import pytest import ray import torch -from vllm.utils import is_hip - -if (not is_hip()): - from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, - nvmlInit) - from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -26,6 +19,7 @@ from vllm.usage.usage_lib import UsageContext from vllm.utils import Counter, random_uuid from ...conftest import cleanup +from ...utils import wait_for_gpu_memory_to_clear class AsyncLLM: @@ -291,38 +285,3 @@ def run_greedy_equality_correctness_test(baseline_llm_generator, print(f'{i=} {baseline_token_ids=}') print(f'{i=} {spec_token_ids=}') assert baseline_token_ids == spec_token_ids - - -def wait_for_gpu_memory_to_clear(devices: List[int], - threshold_bytes: int, - timeout_s: float = 120) -> None: - # Use nvml instead of pytorch to reduce measurement error from torch cuda - # context. 
- nvmlInit() - start_time = time.time() - while True: - output: Dict[int, str] = {} - output_raw: Dict[int, float] = {} - for device in devices: - dev_handle = nvmlDeviceGetHandleByIndex(device) - mem_info = nvmlDeviceGetMemoryInfo(dev_handle) - gb_used = mem_info.used / 2**30 - output_raw[device] = gb_used - output[device] = f'{gb_used:.02f}' - - print('gpu memory used (GB): ', end='') - for k, v in output.items(): - print(f'{k}={v}; ', end='') - print('') - - dur_s = time.time() - start_time - if all(v <= (threshold_bytes / 2**30) for v in output_raw.values()): - print(f'Done waiting for free GPU memory on devices {devices=} ' - f'({threshold_bytes/2**30=}) {dur_s=:.02f}') - break - - if dur_s >= timeout_s: - raise ValueError(f'Memory of devices {devices=} not free after ' - f'{dur_s=:.02f} ({threshold_bytes/2**30=})') - - time.sleep(5) diff --git a/tests/utils.py b/tests/utils.py index f2b2d22b1..bc30515c8 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,7 +4,7 @@ import sys import time import warnings from contextlib import contextmanager -from typing import List +from typing import Dict, List import openai import ray @@ -13,7 +13,11 @@ import requests from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.entrypoints.openai.cli_args import make_arg_parser -from vllm.utils import get_open_port +from vllm.utils import get_open_port, is_hip + +if (not is_hip()): + from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, + nvmlInit) # Path to root of repository so that utilities can be imported by ray workers VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)) @@ -154,3 +158,38 @@ def error_on_warning(): warnings.simplefilter("error") yield + + +def wait_for_gpu_memory_to_clear(devices: List[int], + threshold_bytes: int, + timeout_s: float = 120) -> None: + # Use nvml instead of pytorch to reduce measurement error from torch cuda + # context. 
+ nvmlInit() + start_time = time.time() + while True: + output: Dict[int, str] = {} + output_raw: Dict[int, float] = {} + for device in devices: + dev_handle = nvmlDeviceGetHandleByIndex(device) + mem_info = nvmlDeviceGetMemoryInfo(dev_handle) + gb_used = mem_info.used / 2**30 + output_raw[device] = gb_used + output[device] = f'{gb_used:.02f}' + + print('gpu memory used (GB): ', end='') + for k, v in output.items(): + print(f'{k}={v}; ', end='') + print('') + + dur_s = time.time() - start_time + if all(v <= (threshold_bytes / 2**30) for v in output_raw.values()): + print(f'Done waiting for free GPU memory on devices {devices=} ' + f'({threshold_bytes/2**30=}) {dur_s=:.02f}') + break + + if dur_s >= timeout_s: + raise ValueError(f'Memory of devices {devices=} not free after ' + f'{dur_s=:.02f} ({threshold_bytes/2**30=})') + + time.sleep(5) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 86720e4fb..df25eb111 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -10,6 +10,7 @@ import vllm.envs as envs from vllm.config import DecodingConfig, ModelConfig from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_timeout import asyncio_timeout from vllm.engine.llm_engine import LLMEngine from vllm.executor.ray_utils import initialize_ray_cluster, ray from vllm.inputs import LLMInputs, PromptInputs @@ -545,8 +546,8 @@ class AsyncLLMEngine: # Abort if iteration takes too long due to unrecoverable errors # (eg. NCCL timeouts). try: - has_requests_in_progress = await asyncio.wait_for( - self.engine_step(), ENGINE_ITERATION_TIMEOUT_S) + async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S): + has_requests_in_progress = await self.engine_step() except asyncio.TimeoutError as exc: logger.error( "Engine iteration timed out. 
This should never happen!") diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py new file mode 100644 index 000000000..4b1842625 --- /dev/null +++ b/vllm/engine/async_timeout.py @@ -0,0 +1,189 @@ +# Workaround for https://github.com/python/cpython/issues/86296 +# +# From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py +# Licensed under the Apache License (Apache-2.0) + +import asyncio +import enum +import sys +import warnings +from types import TracebackType +from typing import Any, Optional, Type + +if sys.version_info[:2] >= (3, 11): + from asyncio import timeout as asyncio_timeout +else: + + def asyncio_timeout(delay: Optional[float]) -> "Timeout": + """timeout context manager. + Useful in cases when you want to apply timeout logic around block + of code or in cases when asyncio.wait_for is not suitable. For example: + >>> async with timeout(0.001): + ... async with aiohttp.get('https://github.com') as r: + ... await r.text() + delay - value in seconds or None to disable timeout logic + """ + loop = asyncio.get_running_loop() + deadline = loop.time() + delay if delay is not None else None + return Timeout(deadline, loop) + + class _State(enum.Enum): + INIT = "INIT" + ENTER = "ENTER" + TIMEOUT = "TIMEOUT" + EXIT = "EXIT" + + class Timeout: + # Internal class, please don't instantiate it directly + # Use timeout() and timeout_at() public factories instead. + # + # Implementation note: `async with timeout()` is preferred + # over `with timeout()`. + # While technically the Timeout class implementation + # doesn't need to be async at all, + # the `async with` statement explicitly points that + # the context manager should be used from async function context. + # + # This design allows to avoid many silly misusages. + # + # TimeoutError is raised immediately when scheduled + # if the deadline is passed. + # The purpose is to time out as soon as possible + # without waiting for the next await expression. 
+ + __slots__ = ("_deadline", "_loop", "_state", "_timeout_handler") + + def __init__(self, deadline: Optional[float], + loop: asyncio.AbstractEventLoop) -> None: + self._loop = loop + self._state = _State.INIT + + self._timeout_handler = None # type: Optional[asyncio.Handle] + if deadline is None: + self._deadline = None # type: Optional[float] + else: + self.update(deadline) + + def __enter__(self) -> "Timeout": + warnings.warn( + "with timeout() is deprecated, use async with timeout()", + DeprecationWarning, + stacklevel=2, + ) + self._do_enter() + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], + ) -> Optional[bool]: + self._do_exit(exc_type) + return None + + async def __aenter__(self) -> "Timeout": + self._do_enter() + return self + + async def __aexit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], + ) -> Optional[bool]: + self._do_exit(exc_type) + return None + + @property + def expired(self) -> bool: + """Is timeout expired during execution?""" + return self._state == _State.TIMEOUT + + @property + def deadline(self) -> Optional[float]: + return self._deadline + + def reject(self) -> None: + """Reject scheduled timeout if any.""" + # cancel is maybe better name but + # task.cancel() raises CancelledError in asyncio world. + if self._state not in (_State.INIT, _State.ENTER): + raise RuntimeError(f"invalid state {self._state.value}") + self._reject() + + def _reject(self) -> None: + if self._timeout_handler is not None: + self._timeout_handler.cancel() + self._timeout_handler = None + + def shift(self, delay: float) -> None: + """Advance timeout on delay seconds. + The delay can be negative. 
+ Raise RuntimeError if shift is called when deadline is not scheduled + """ + deadline = self._deadline + if deadline is None: + raise RuntimeError( + "cannot shift timeout if deadline is not scheduled") + self.update(deadline + delay) + + def update(self, deadline: float) -> None: + """Set deadline to absolute value. + deadline argument points on the time in the same clock system + as loop.time(). + If new deadline is in the past the timeout is raised immediately. + Please note: it is not POSIX time but a time with + undefined starting base, e.g. the time of the system power on. + """ + if self._state == _State.EXIT: + raise RuntimeError( + "cannot reschedule after exit from context manager") + if self._state == _State.TIMEOUT: + raise RuntimeError("cannot reschedule expired timeout") + if self._timeout_handler is not None: + self._timeout_handler.cancel() + self._deadline = deadline + if self._state != _State.INIT: + self._reschedule() + + def _reschedule(self) -> None: + assert self._state == _State.ENTER + deadline = self._deadline + if deadline is None: + return + + now = self._loop.time() + if self._timeout_handler is not None: + self._timeout_handler.cancel() + + task = asyncio.current_task() + if deadline <= now: + self._timeout_handler = self._loop.call_soon( + self._on_timeout, task) + else: + self._timeout_handler = self._loop.call_at( + deadline, self._on_timeout, task) + + def _do_enter(self) -> None: + if self._state != _State.INIT: + raise RuntimeError(f"invalid state {self._state.value}") + self._state = _State.ENTER + self._reschedule() + + def _do_exit(self, exc_type: Optional[Type[BaseException]]) -> None: + if exc_type is asyncio.CancelledError and \ + self._state == _State.TIMEOUT: + self._timeout_handler = None + raise asyncio.TimeoutError + # timeout has not expired + self._state = _State.EXIT + self._reject() + return None + + def _on_timeout(self, task: "Optional[asyncio.Task[Any]]") -> None: + if task: + task.cancel() + self._state = 
_State.TIMEOUT + # drop the reference early + self._timeout_handler = None -- GitLab From e83db9e7e3d776cd9b059a49024f3950ef579b41 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 19 Jun 2024 18:01:45 -0400 Subject: [PATCH 105/376] [Doc] Update docker references (#5614) Signed-off-by: Rafael Vasquez --- docs/source/dev/dockerfile/dockerfile.rst | 20 +++++++++---------- docs/source/serving/deploying_with_docker.rst | 7 +++---- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/docs/source/dev/dockerfile/dockerfile.rst b/docs/source/dev/dockerfile/dockerfile.rst index a07463392..40ba818eb 100644 --- a/docs/source/dev/dockerfile/dockerfile.rst +++ b/docs/source/dev/dockerfile/dockerfile.rst @@ -2,19 +2,19 @@ Dockerfile ==================== See `here `_ for the main Dockerfile to construct -the image for running an OpenAI compatible server with vLLM. +the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here `_. -- Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: +Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: - - All build stages - - The default build target (highlighted in grey) - - External images (with dashed borders) +- All build stages +- The default build target (highlighted in grey) +- External images (with dashed borders) - The edges of the build graph represent: - - - FROM ... dependencies (with a solid line and a full arrow head) - - COPY --from=... dependencies (with a dashed line and an empty arrow head) - - RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head) +The edges of the build graph represent: + +- FROM ... dependencies (with a solid line and a full arrow head) +- COPY --from=... dependencies (with a dashed line and an empty arrow head) +- RUN --mount=(.*)from=... 
dependencies (with a dotted line and an empty diamond arrow head) .. figure:: ../../assets/dev/dockerfile-stages-dependency.png :alt: query diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index fa82bc8e3..14d94b09e 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -3,9 +3,8 @@ Deploying with Docker ============================ -vLLM offers official docker image for deployment. -The image can be used to run OpenAI compatible server. -The image is available on Docker Hub as `vllm/vllm-openai `_. +vLLM offers an official Docker image for deployment. +The image can be used to run OpenAI compatible server and is available on Docker Hub as `vllm/vllm-openai `_. .. code-block:: console @@ -25,7 +24,7 @@ The image is available on Docker Hub as `vllm/vllm-openai `_. To build vLLM: .. code-block:: console -- GitLab From 4a30d7e3ccae6e977d728e2157aaa11ac0fed549 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 19 Jun 2024 18:06:44 -0400 Subject: [PATCH 106/376] [Misc] Add per channel support for static activation quantization; update w8a8 schemes to share base classes (#5650) --- tests/quantization/test_compressed_tensors.py | 14 ++- .../compressed_tensors/compressed_tensors.py | 10 ++- .../schemes/compressed_tensors_w8a8.py | 84 +++++++++++++++++ .../compressed_tensors_w8a8_dynamictoken.py | 89 +++---------------- .../compressed_tensors_w8a8_statictensor.py | 60 +++---------- 5 files changed, 121 insertions(+), 136 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index b78081155..aaa366335 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -13,8 +13,12 @@ from 
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso CompressedTensorsW8A8StaticTensor) -def test_compressed_tensors_w8a8_static_setup(vllm_runner): - model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change" +@pytest.mark.parametrize("model_args", [ + ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor"), + ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel"), +]) +def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): + model_path, strategy = model_args with vllm_runner(model_path, enforce_eager=True) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] @@ -33,12 +37,14 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner): assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor) + assert qkv_proj.scheme.strategy == strategy assert qkv_proj.weight.dtype is torch.int8 assert o_proj.weight.dtype is torch.int8 assert gate_up_proj.weight.dtype is torch.int8 - assert qkv_proj.weight_scale.shard_splitter is not None - assert qkv_proj.weight_scale.logical_widths is not None + if qkv_proj.scheme.strategy == "tensor": + assert qkv_proj.weight_scale.shard_splitter is not None + assert qkv_proj.weight_scale.logical_widths is not None assert qkv_proj.input_scale.dtype is torch.float32 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 347a052a6..44dd024af 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -85,8 +85,11 @@ class CompressedTensorsConfig(QuantizationConfig): def _is_static_tensor_w8a8(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 - is_tensor = 
(weight_quant.strategy == input_quant.strategy == - QuantizationStrategy.TENSOR.value) + weight_strategy = ( + weight_quant.strategy == QuantizationStrategy.TENSOR.value + or weight_quant.strategy == QuantizationStrategy.CHANNEL.value) + is_tensor = (weight_strategy and input_quant.strategy + == QuantizationStrategy.TENSOR.value) is_symmetric = weight_quant.symmetric and input_quant.symmetric is_static = not weight_quant.dynamic and not input_quant.dynamic @@ -131,7 +134,8 @@ class CompressedTensorsConfig(QuantizationConfig): if self.quant_format == CompressionFormat.int_quantized.value: if self._is_static_tensor_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8StaticTensor() + return CompressedTensorsW8A8StaticTensor( + strategy=weight_quant.strategy) if self._is_dynamic_token_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8DynamicToken( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py new file mode 100644 index 000000000..efed79ec7 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py @@ -0,0 +1,84 @@ +from typing import Callable, List, Tuple, Union + +import torch +from torch.nn import Parameter + +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + QuantizationStrategy) +from vllm.model_executor.utils import set_weight_attrs + + +class CompressedTensorsW8A8(CompressedTensorsScheme): + + def __init__(self, strategy: str): + self.strategy = strategy + + def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: + if isinstance(shard_id, int): + return shard_id + + assert isinstance(shard_id, str) + qkv_idxs = {"q": 0, "k": 1, "v": 2} + assert shard_id in qkv_idxs + return 
qkv_idxs[shard_id] + + def scales_shard_splitter( + self, param: torch.Tensor, loaded_weight: torch.Tensor, + shard_id: Union[str, int], + logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + shard_id = self._shard_id_as_int(shard_id) + offset = sum(logical_widths[:shard_id]) + size = logical_widths[shard_id] + # update loaded weight with copies for broadcast. + loaded_weight = loaded_weight.repeat(size) + return param[offset:offset + size], loaded_weight + + def create_weights(self, layer: torch.nn.Module, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + is_tensor_partitioned = len(output_partition_sizes) != 1 + weight_scale_dim = sum(output_partition_sizes) if ( + is_tensor_partitioned + or self.strategy == QuantizationStrategy.CHANNEL) else 1 + + shape: Union[Tuple[int], Tuple[int, int]] = (weight_scale_dim, ) + if self.strategy == QuantizationStrategy.CHANNEL: + shape = (weight_scale_dim, 1) + + weight_scale = Parameter(torch.empty(*shape, dtype=torch.float32), + requires_grad=False) + + layer.register_parameter("weight_scale", weight_scale) + set_weight_attrs(weight_scale, {"weight_loader": weight_loader}) + + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + dtype=torch.int8), + requires_grad=False) + + layer.register_parameter("weight", weight) + set_weight_attrs( + weight, { + "input_dim": 1, + "output_dim": 0, + "weight_loader": weight_loader, + "logical_widths": output_partition_sizes + }) + + # Don't need a shard_splitter for channel-wise quantization + # Use the default loading method + if self.strategy == QuantizationStrategy.CHANNEL: + set_weight_attrs(weight_scale, { + "output_dim": 0, + }) + else: + set_weight_attrs( + weight_scale, { + "logical_widths": output_partition_sizes, + "shard_splitter": self.scales_shard_splitter, + }) diff --git 
a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py index 37610c9c2..5fc05b8e6 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py @@ -1,42 +1,15 @@ -from typing import Callable, List, Tuple, Union +from typing import Callable, List import torch -from torch.nn import Parameter from vllm import _custom_ops as custom_ops -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsScheme) -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - QuantizationStrategy) -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_w8a8 import ( # noqa: E501 + CompressedTensorsW8A8) __all__ = ["CompressedTensorsW8A8DynamicToken"] -class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme): - - def __init__(self, strategy: str): - self.strategy = strategy - - def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: - if isinstance(shard_id, int): - return shard_id - - assert isinstance(shard_id, str) - qkv_idxs = {"q": 0, "k": 1, "v": 2} - assert shard_id in qkv_idxs - return qkv_idxs[shard_id] - - def scales_shard_splitter( - self, param: torch.Tensor, loaded_weight: torch.Tensor, - shard_id: Union[str, int], - logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - shard_id = self._shard_id_as_int(shard_id) - offset = sum(logical_widths[:shard_id]) - size = logical_widths[shard_id] - # update loaded weight with copies for broadcast. 
- loaded_weight = loaded_weight.repeat(size) - return param[offset:offset + size], loaded_weight +class CompressedTensorsW8A8DynamicToken(CompressedTensorsW8A8): def create_weights(self, layer: torch.nn.Module, output_partition_sizes: List[int], @@ -44,54 +17,12 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme): params_dtype: torch.dtype, weight_loader: Callable, **kwargs): - # When the scales have a single value, it is required that they be - # on the CPU for performance and CUDA Graphs compatibility. Please - # refer to the comment in - # CompressedTensorsW8A8StaticTensor::create_weights for further - # information. - is_tensor_partitioned = len(output_partition_sizes) != 1 - # when doing channel-wise quantization, number of scales - # is equal to output_dim - weight_scale_dim = sum(output_partition_sizes) if ( - is_tensor_partitioned - or self.strategy == QuantizationStrategy.CHANNEL) else 1 - - shape: Union[Tuple[int], Tuple[int, int]] = (weight_scale_dim, ) - if self.strategy == QuantizationStrategy.CHANNEL: - shape = (weight_scale_dim, 1) - - weight_scale = Parameter(torch.empty(*shape, dtype=torch.float32), - requires_grad=False) - - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=torch.int8), - requires_grad=False) - - layer.register_parameter("weight", weight) - set_weight_attrs( - weight, { - "input_dim": 1, - "output_dim": 0, - "weight_loader": weight_loader, - "logical_widths": output_partition_sizes - }) - - layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs(weight_scale, {"weight_loader": weight_loader}) - - # Don't need a shard_splitter for channel-wise quantization - # Use the default loading method - if self.strategy == QuantizationStrategy.CHANNEL: - set_weight_attrs(weight_scale, { - "output_dim": 0, - }) - else: - set_weight_attrs( - weight_scale, { - "logical_widths": output_partition_sizes, - "shard_splitter": self.scales_shard_splitter, - }) + 
super().create_weights( + layer=layer, + output_partition_sizes=output_partition_sizes, + input_size_per_partition=input_size_per_partition, + params_dtype=params_dtype, + weight_loader=weight_loader) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): weight = layer.weight diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py index 414e17a06..79f5358a3 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py @@ -1,37 +1,17 @@ -from typing import Callable, List, Tuple, Union +from typing import Callable, List import torch from torch.nn import Parameter from vllm import _custom_ops as custom_ops -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_w8a8 import ( # noqa: E501 + CompressedTensorsW8A8) from vllm.model_executor.utils import set_weight_attrs __all__ = ["CompressedTensorsW8A8StaticTensor"] -class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme): - - def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: - if isinstance(shard_id, int): - return shard_id - - assert isinstance(shard_id, str) - qkv_idxs = {"q": 0, "k": 1, "v": 2} - assert shard_id in qkv_idxs - return qkv_idxs[shard_id] - - def scales_shard_splitter( - self, param: torch.Tensor, loaded_weight: torch.Tensor, - shard_id: Union[str, int], - logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - shard_id = self._shard_id_as_int(shard_id) - offset = sum(logical_widths[:shard_id]) - size = logical_widths[shard_id] - # update loaded weight with copies for broadcast. 
- loaded_weight = loaded_weight.repeat(size) - return param[offset:offset + size], loaded_weight +class CompressedTensorsW8A8StaticTensor(CompressedTensorsW8A8): def create_weights(self, layer: torch.nn.Module, output_partition_sizes: List[int], @@ -39,41 +19,21 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme): params_dtype: torch.dtype, weight_loader: Callable, **kwargs): - is_tensor_partitioned = len(output_partition_sizes) != 1 - weight_scale_dim = sum( - output_partition_sizes) if is_tensor_partitioned else 1 + super().create_weights( + layer=layer, + output_partition_sizes=output_partition_sizes, + input_size_per_partition=input_size_per_partition, + params_dtype=params_dtype, + weight_loader=weight_loader) input_scale = Parameter(torch.empty(1, dtype=torch.float32), requires_grad=False) - weight_scale = Parameter(torch.empty(weight_scale_dim, - dtype=torch.float32), - requires_grad=False) - - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=torch.int8), - requires_grad=False) - - layer.register_parameter("weight", weight) - set_weight_attrs(weight, { - "weight_loader": weight_loader, - "input_dim": 1, - "output_dim": 0, - }) layer.register_parameter("input_scale", input_scale) set_weight_attrs(input_scale, { "weight_loader": weight_loader, "ignore_warning": True, }) - layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs( - weight_scale, { - "weight_loader": weight_loader, - "shard_splitter": self.scales_shard_splitter, - "logical_widths": output_partition_sizes, - "ignore_warning": True, - }) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): weight = layer.weight -- GitLab From 949e49a6857080e36ecd62f6e193754290c7c43c Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Wed, 19 Jun 2024 16:30:03 -0700 Subject: [PATCH 107/376] [ci] Limit num gpus if specified for A100 (#5694) Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 1 + .buildkite/test-template-aws.j2 | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b1602dd94..95cd5b198 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -184,6 +184,7 @@ steps: - label: Distributed Tests (A100) gpu: a100 + num_gpus: 4 commands: # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index 08146bf44..fb34b787e 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -75,7 +75,7 @@ steps: - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" resources: limits: - nvidia.com/gpu: 8 + nvidia.com/gpu: {{ step.num_gpus or 1 }} volumeMounts: - name: devshm mountPath: /dev/shm -- GitLab From 3730a1c832bca5ca8128aec3c7659304895edf2e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 20 Jun 2024 10:09:21 +0800 Subject: [PATCH 108/376] [Misc] Improve conftest (#5681) --- tests/conftest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index f37c9883f..5bbfd87f0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -365,7 +365,7 @@ class HfRunner: cleanup() -@pytest.fixture +@pytest.fixture(scope="session") def hf_runner(): return HfRunner @@ -385,6 +385,7 @@ class VllmRunner: block_size: int = 16, enable_chunked_prefill: bool = False, swap_space: int = 4, + enforce_eager: bool = False, **kwargs, ) -> None: self.model = LLM( @@ -393,6 +394,7 @@ class VllmRunner: trust_remote_code=True, dtype=dtype, swap_space=swap_space, + 
enforce_eager=enforce_eager, disable_log_stats=disable_log_stats, tensor_parallel_size=tensor_parallel_size, max_model_len=max_model_len, -- GitLab From 1b2eaac3165dc78d4ef51231722735ca9cf37304 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Wed, 19 Jun 2024 23:10:47 -0700 Subject: [PATCH 109/376] [Bugfix][Doc] FIx Duplicate Explicit Target Name Errors (#5703) --- docs/source/dev/dockerfile/dockerfile.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/dev/dockerfile/dockerfile.rst b/docs/source/dev/dockerfile/dockerfile.rst index 40ba818eb..9c17c27aa 100644 --- a/docs/source/dev/dockerfile/dockerfile.rst +++ b/docs/source/dev/dockerfile/dockerfile.rst @@ -1,8 +1,8 @@ Dockerfile ==================== -See `here `_ for the main Dockerfile to construct -the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here `_. +See `here `__ for the main Dockerfile to construct +the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here `__. Below is a visual representation of the multi-stage Dockerfile. 
The build graph contains the following nodes: -- GitLab From 111af1fa2c4fdb2d83b466935a327b1a5009874a Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Thu, 20 Jun 2024 12:07:08 +0530 Subject: [PATCH 110/376] [Kernel] Update Cutlass int8 kernel configs for SM90 (#5514) Co-authored-by: Varun Sundar Rabindranath --- .../cutlass_w8a8/scaled_mm_c3x.cu | 165 +++++++++++++++--- 1 file changed, 143 insertions(+), 22 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index f1a2b73ff..8f2aa9425 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -234,15 +234,15 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, } template typename Epilogue, int32_t M> -struct sm90_fp8_config { + template typename Epilogue> +struct sm90_fp8_config_default { + // M in (128, inf) static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; using TileShape = Shape<_128, _128, _128>; using ClusterShape = Shape<_2, _1, _1>; - using Cutlass3xGemm = cutlass_3x_gemm; @@ -250,14 +250,14 @@ struct sm90_fp8_config { template typename Epilogue> -struct sm90_fp8_config { +struct sm90_fp8_config_M128 { + // M in (64, 128] static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; using TileShape = Shape<_64, _128, _128>; using ClusterShape = Shape<_2, _1, _1>; - using Cutlass3xGemm = cutlass_3x_gemm; @@ -265,7 +265,8 @@ struct sm90_fp8_config { template typename Epilogue> -struct sm90_fp8_config { +struct sm90_fp8_config_M64 { + // M in [1, 64] static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; @@ -278,6 +279,78 @@ struct 
sm90_fp8_config { KernelSchedule, EpilogueSchedule>; }; +template typename Epilogue> +struct sm90_int8_config_default { + // For M > 128 and any N + static_assert(std::is_same()); + using KernelSchedule = + typename cutlass::gemm::KernelTmaWarpSpecializedPingpong; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_128, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M128 { + // For M in (64, 128] and any N + static_assert(std::is_same()); + using KernelSchedule = + typename cutlass::gemm::KernelTmaWarpSpecializedPingpong; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M64 { + // For M in (32, 64] and any N + static_assert(std::is_same()); + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _64, _256>; + using ClusterShape = Shape<_1, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M32_NBig { + // For M in [1, 32] and N >= 8192 + static_assert(std::is_same()); + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _128, _256>; + using ClusterShape = Shape<_1, _4, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M32_NSmall { + // For M in [1, 32] and N < 8192 + static_assert(std::is_same()); + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename 
cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _64, _256>; + using ClusterShape = Shape<_1, _8, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + } // namespace template ::Cutlass3xGemm; + typename sm90_fp8_config_default::Cutlass3xGemm; using Cutlass3xGemmM64 = - typename sm90_fp8_config::Cutlass3xGemm; + typename sm90_fp8_config_M64::Cutlass3xGemm; using Cutlass3xGemmM128 = - typename sm90_fp8_config::Cutlass3xGemm; + typename sm90_fp8_config_M128::Cutlass3xGemm; uint32_t const m = a.size(0); uint32_t const mp2 = @@ -316,6 +390,61 @@ void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a, } } +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kInt8); + TORCH_CHECK(b.dtype() == torch::kInt8); + + using Cutlass3xGemmDefault = + typename sm90_int8_config_default::Cutlass3xGemm; + using Cutlass3xGemmM128 = + typename sm90_int8_config_M128::Cutlass3xGemm; + using Cutlass3xGemmM64 = + typename sm90_int8_config_M64::Cutlass3xGemm; + using Cutlass3xGemmM32NBig = + typename sm90_int8_config_M32_NBig::Cutlass3xGemm; + using Cutlass3xGemmM32NSmall = + typename sm90_int8_config_M32_NSmall::Cutlass3xGemm; + + uint32_t const n = out.size(1); + bool const is_small_n = n < 8192; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(32), next_pow_2(m)); // next power of 2 + + if (mp2 <= 32) { + // m in [1, 32] + if (is_small_n) { + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else { + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } + } else if (mp2 <= 64) { + // m in (32, 64] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 128) { + // m in (64, 128] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + 
} else { + // m in (128, inf) + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + void cutlass_scaled_mm_sm90(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, @@ -326,22 +455,14 @@ void cutlass_scaled_mm_sm90(torch::Tensor& out, torch::Tensor const& a, if (a.dtype() == torch::kInt8) { TORCH_CHECK(b.dtype() == torch::kInt8); - using TileShape = Shape<_128, _128, _128>; - using ClusterShape = Shape<_1, _2, _1>; - using KernelSchedule = - typename cutlass::gemm::KernelTmaWarpSpecializedPingpong; - using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; - if (out.dtype() == torch::kBFloat16) { - return cutlass_gemm_caller>(out, a, b, a_scales, b_scales); + return cutlass_gemm_sm90_int8_dispatch( + out, a, b, a_scales, b_scales); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - - return cutlass_gemm_caller< - cutlass_3x_gemm>( + return cutlass_gemm_sm90_int8_dispatch( out, a, b, a_scales, b_scales); } } else { -- GitLab From ad137cd1112ab9b17ac36fc123fc7806a1d7473d Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Thu, 20 Jun 2024 04:52:09 -0700 Subject: [PATCH 111/376] [Model] Port over CLIPVisionModel for VLMs (#5591) --- csrc/activation_kernels.cu | 12 ++ csrc/ops.h | 2 + csrc/torch_bindings.cpp | 4 + vllm/_custom_ops.py | 4 + vllm/model_executor/layers/activation.py | 16 ++ vllm/model_executor/models/clip.py | 203 +++++++++++++++++++++++ vllm/model_executor/models/llava.py | 17 +- vllm/model_executor/models/llava_next.py | 19 ++- vllm/model_executor/models/phi3v.py | 13 +- 9 files changed, 269 insertions(+), 21 deletions(-) create mode 100644 vllm/model_executor/models/clip.py diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 86ac2e75e..5ed1dc3b8 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -135,6 +135,12 @@ __device__ __forceinline__ T gelu_fast_kernel(const T& x) { 
return ((T)0.5) * x * (((T)1.0) + t); } +template +__device__ __forceinline__ T gelu_quick_kernel(const T& x) { + // x * sigmoid(1.702 * x) + return (T)(((float)x) / (1.0f + expf(-1.702f * (float)x))); +} + } // namespace vllm void gelu_new(torch::Tensor& out, // [..., d] @@ -148,3 +154,9 @@ void gelu_fast(torch::Tensor& out, // [..., d] { LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); } + +void gelu_quick(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., d] +{ + LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel); +} diff --git a/csrc/ops.h b/csrc/ops.h index 9e2e977fa..ba92cc537 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -49,6 +49,8 @@ void gelu_new(torch::Tensor& out, torch::Tensor& input); void gelu_fast(torch::Tensor& out, torch::Tensor& input); +void gelu_quick(torch::Tensor& out, torch::Tensor& input); + #ifndef USE_ROCM torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 867bf4389..953f2eb4d 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -68,6 +68,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("gelu_fast(Tensor! out, Tensor input) -> ()"); ops.impl("gelu_fast", torch::kCUDA, &gelu_fast); + // Quick GELU implementation. + ops.def("gelu_quick(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_quick", torch::kCUDA, &gelu_quick); + // Layernorm // Apply Root Mean Square (RMS) Normalization to the input tensor. 
ops.def( diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index ab2a67950..a053a3aa2 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -66,6 +66,10 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: torch.ops._C.gelu_new(out, x) +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + torch.ops._C.gelu_quick(out, x) + + # page attention ops def paged_attention_v1( out: torch.Tensor, diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index eb0606948..80cad15b4 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -141,6 +141,21 @@ class FastGELU(CustomOp): return out +class QuickGELU(CustomOp): + + # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90 + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + return x * torch.sigmoid(1.702 * x) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + from vllm import _custom_ops as ops + + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out + + class ScaledActivation(nn.Module): """An activation function with post-scale parameters. 
@@ -189,6 +204,7 @@ _ACTIVATION_REGISTRY = { "gelu_new": NewGELU(), "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), "relu": nn.ReLU(), + "quick_gelu": QuickGELU(), } diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py new file mode 100644 index 000000000..aa4e87228 --- /dev/null +++ b/vllm/model_executor/models/clip.py @@ -0,0 +1,203 @@ +"""Minimal implementation of CLIPVisionModel intended to be only used +within a vision language model.""" +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from transformers import CLIPVisionConfig +from transformers.models.clip.modeling_clip import CLIPAttention + +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + + +def get_clip_num_patches(image_size: int, patch_size: int) -> int: + assert image_size % patch_size == 0 + return (image_size // patch_size)**2 + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa +class CLIPVisionEmbeddings(nn.Module): + + def __init__(self, config: CLIPVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + ) + + self.num_patches = get_clip_num_patches(self.image_size, + self.patch_size) + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, + self.embed_dim) + self.register_buffer("position_ids", + torch.arange(self.num_positions).expand((1, -1)), + 
persistent=False) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to( + dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + + return embeddings + + +class CLIPMLP(nn.Module): + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config) + self.fc2 = RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + + return hidden_states + + +class CLIPEncoderLayer(nn.Module): + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + self.self_attn = CLIPAttention(config) + self.layer_norm1 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.mlp = CLIPMLP(config, quant_config=quant_config) + self.layer_norm2 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]: + + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states 
+ hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class CLIPEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self + attention layers. Each layer is a [`CLIPEncoderLayer`]. + + Args: + config: CLIPConfig + """ + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + self.layers = nn.ModuleList([ + CLIPEncoderLayer(config=config, quant_config=quant_config) + for _ in range(config.num_hidden_layers) + ]) + + def forward(self, + inputs_embeds: torch.Tensor, + vision_feature_layer: int = -1): + + # Encoder forward pass only up to the required layer + num_layer = len(self.layers) + vision_feature_layer + 1 + hidden_states = inputs_embeds + for encoder_layer in self.layers[:num_layer]: + hidden_states = encoder_layer(hidden_states) + + return hidden_states + + +class CLIPVisionTransformer(nn.Module): + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = CLIPVisionEmbeddings(config) + + # NOTE: This typo of "layrnorm" is not fixed on purpose to match + # the original transformers code and name of the model weights. 
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.encoder = CLIPEncoder(config=config, quant_config=quant_config) + + def forward( + self, + pixel_values: torch.Tensor, + vision_feature_layer: int = -1, + ) -> torch.Tensor: + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + hidden_states = self.encoder(inputs_embeds=hidden_states, + vision_feature_layer=vision_feature_layer) + + return hidden_states + + +class CLIPVisionModel(nn.Module): + + config_class = CLIPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.vision_model = CLIPVisionTransformer(config=config, + quant_config=quant_config) + + def forward(self, + pixel_values: Optional[torch.Tensor] = None, + vision_feature_layer: int = -1): + + return self.vision_model(pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer) + + @property + def device(self): + return next(self.parameters()).device diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 39355b9d3..8e36c54b1 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -2,9 +2,7 @@ from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union import torch import torch.nn as nn -# TODO(xwjiang): We should port CLIPVisionModel's code over to not depend on -# transformers' impl. 
-from transformers import CLIPVisionModel, LlavaConfig +from transformers import LlavaConfig from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VisionLanguageConfig @@ -15,6 +13,7 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -189,12 +188,11 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase): def _image_pixels_to_features(self, vision_tower: CLIPVisionModel, pixel_values: torch.Tensor) -> torch.Tensor: - # TODO(xwjiang): Maybe port minimal CLIPVisionModel over. - image_outputs = vision_tower(pixel_values.to(vision_tower.device), - output_hidden_states=True) - image_features = image_outputs.hidden_states[ - self.config.vision_feature_layer] + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + image_features = vision_tower(pixel_values.to(vision_tower.device), + self.config.vision_feature_layer) return self._select_image_features( image_features, @@ -317,6 +315,9 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + # post_layernorm is not needed in CLIPVisionModel + if "vision_model.post_layernorm" in name: + continue for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in name: name = name.replace(key_to_modify, new_key) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 0ab9afea9..c1158c933 100644 --- 
a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -4,9 +4,7 @@ from typing import (Dict, Iterable, List, Literal, Optional, Tuple, TypedDict, import torch import torch.nn as nn from PIL import Image -# TODO(xwjiang): We should port CLIPVisionModel's code over to not depend on -# transformers' impl. -from transformers import CLIPVisionModel, LlavaNextConfig +from transformers import LlavaNextConfig from transformers.models.llava_next.modeling_llava_next import ( get_anyres_image_grid_shape, unpad_image) from typing_extensions import NotRequired @@ -20,6 +18,7 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData @@ -121,7 +120,7 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase): if self.vision_language_config.image_input_type == ( VisionLanguageConfig.ImageInputType.PIXEL_VALUES): - self.vision_tower = CLIPVisionModel(config.vision_config) + self.vision_tower = CLIPVisionModel(config=config.vision_config) else: raise TypeError("Image features are not supported by LLaVA-NeXT") @@ -219,12 +218,11 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase): def _image_pixels_to_features(self, vision_tower: CLIPVisionModel, pixel_values: torch.Tensor) -> torch.Tensor: - # TODO(xwjiang): Maybe port minimal CLIPVisionModel over. 
- image_outputs = vision_tower(pixel_values.to(vision_tower.device), - output_hidden_states=True) - image_features = image_outputs.hidden_states[ - self.config.vision_feature_layer] + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + image_features = vision_tower(pixel_values.to(vision_tower.device), + self.config.vision_feature_layer) return self._select_image_features( image_features, @@ -430,6 +428,9 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + # post_layernorm is not needed in CLIPVisionModel + if "vision_model.post_layernorm" in name: + continue for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in name: name = name.replace(key_to_modify, new_key) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 35f3b894f..fa20a7c59 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -17,7 +17,7 @@ from typing import Iterable, List, Literal, Optional, Tuple, TypedDict import torch import torch.nn as nn -from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig +from transformers import CLIPVisionConfig, PretrainedConfig from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VisionLanguageConfig @@ -27,6 +27,7 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.models.vlm_base import VisionLanguageModelBase from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -70,9 
+71,10 @@ class Phi3ImageEmbeddingBase(nn.Module): LAYER_IDX = self.layer_idx TYPE_FEATURE = self.type_feature - img_processor_output = self.img_processor(img_embeds, - output_hidden_states=True) - img_feature = img_processor_output.hidden_states[LAYER_IDX] + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the img_processor + img_feature = self.img_processor(img_embeds, + vision_feature_layer=LAYER_IDX) if TYPE_FEATURE == "patch": patch_feature = img_feature[:, 1:] @@ -352,6 +354,9 @@ class Phi3VForCausalLM(VisionLanguageModelBase): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + # post_layernorm is not needed in CLIPVisionModel + if "vision_model.post_layernorm" in name: + continue for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in name: name = name.replace(key_to_modify, new_key) -- GitLab From a7dcc62086ea751b46b4821c2811cf8ac83711bf Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Thu, 20 Jun 2024 19:03:21 +0530 Subject: [PATCH 112/376] [Kernel] Update Cutlass int8 kernel configs for SM80 (#5275) Co-authored-by: Varun Sundar Rabindranath --- csrc/quantization/cutlass_w8a8/common.hpp | 7 + .../cutlass_w8a8/scaled_mm_c2x.cu | 127 ++++++++++++++++-- .../cutlass_w8a8/scaled_mm_c3x.cu | 5 - 3 files changed, 123 insertions(+), 16 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/common.hpp b/csrc/quantization/cutlass_w8a8/common.hpp index 999b7b251..23d0587bb 100644 --- a/csrc/quantization/cutlass_w8a8/common.hpp +++ b/csrc/quantization/cutlass_w8a8/common.hpp @@ -1,6 +1,7 @@ #pragma once #include "cutlass/cutlass.h" +#include /** * Helper function for checking CUTLASS errors @@ -10,3 +11,9 @@ TORCH_CHECK(status == cutlass::Status::kSuccess, \ cutlassGetStatusString(status)) \ } + +inline uint32_t next_pow_2(uint32_t const num) { + if (num <= 1) return num; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + diff 
--git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index 7651268dc..740b9fb64 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -250,8 +250,120 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, CUTLASS_CHECK(status); } +template typename Epilogue> +struct sm80_config_default { + // This config is used in 2 cases, + // - M in (128, inf) + // - M in (64, 128] and N >= 8192 + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; + using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +template typename Epilogue> +struct sm80_config_M64 { + // This config is used in 2 cases, + // - M in (32, 64] + // - M in (64, 128] and N < 8192 + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>; + using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +template typename Epilogue> +struct sm80_config_M32 { + // M in (16, 32] + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>; + using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +template typename Epilogue> +struct sm80_config_M16 { + // M in [1, 16] + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>; + using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + } // namespace 
+template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kInt8); + TORCH_CHECK(b.dtype() == torch::kInt8); + + using Cutlass2xGemmDefault = + typename sm80_config_default::Cutlass2xGemm; + using Cutlass2xGemmM128BigN = + typename sm80_config_default::Cutlass2xGemm; + using Cutlass2xGemmM128SmallN = + typename sm80_config_M64::Cutlass2xGemm; + using Cutlass2xGemmM64 = + typename sm80_config_M64::Cutlass2xGemm; + using Cutlass2xGemmM32 = + typename sm80_config_M32::Cutlass2xGemm; + using Cutlass2xGemmM16 = + typename sm80_config_M16::Cutlass2xGemm; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(16), next_pow_2(m)); // next power of 2 + if (mp2 <= 16) { + // M in [1, 16] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 32) { + // M in (16, 32] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 64) { + // M in (32, 64] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 128) { + // M in (64, 128] + uint32_t const n = out.size(1); + bool const small_n = n < 8192; + if (small_n) { + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else { + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } + } else { + // M in (128, inf) + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, @@ -288,20 +400,13 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; - using WarpShape = 
typename cutlass::gemm::GemmShape<64, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - if (out.dtype() == torch::kBFloat16) { - return cutlass_gemm_caller>( - out, a, b, a_scales, b_scales); + return cutlass_gemm_sm80_dispatch(out, a, b, a_scales, + b_scales); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - return cutlass_gemm_caller>( + return cutlass_gemm_sm80_dispatch( out, a, b, a_scales, b_scales); } } diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index 8f2aa9425..cfa8f80f7 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -44,11 +44,6 @@ using namespace cute; namespace { -uint32_t next_pow_2(uint32_t const num) { - if (num <= 1) return num; - return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); -} - // A wrapper for the GEMM kernel that is used to guard against compilation on // architectures that will never use the kernel. The purpose of this is to // reduce the size of the compiled binary. 
-- GitLab From 3f3b6b21500bce2061cae33706bd47c8b6663771 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 20 Jun 2024 14:36:10 -0400 Subject: [PATCH 113/376] [Bugfix] Fix the CUDA version check for FP8 support in the CUTLASS kernels (#5715) --- csrc/ops.h | 2 ++ .../quantization/cutlass_w8a8/scaled_mm_entry.cu | 16 ++++++++++++++++ csrc/torch_bindings.cpp | 6 ++++++ vllm/_custom_ops.py | 4 ++++ vllm/model_executor/layers/quantization/fp8.py | 15 ++------------- 5 files changed, 30 insertions(+), 13 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index ba92cc537..6f0a7143c 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -92,6 +92,8 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, int64_t num_bits); +bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); + void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales); diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 687f8efd8..f4e582d78 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -25,6 +25,22 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b_scales); #endif +bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) { + // CUTLASS FP8 kernels need at least + // CUDA 12.0 on SM90 systems (Hopper) + // CUDA 12.4 on SM89 systems (Lovelace) + +#if defined CUDA_VERSION + if (cuda_device_capability >= 90) { + return CUDA_VERSION >= 12000; + } else if (cuda_device_capability >= 89) { + return CUDA_VERSION >= 12040; + } +#endif + + return false; +} + void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { diff --git a/csrc/torch_bindings.cpp 
b/csrc/torch_bindings.cpp index 953f2eb4d..227b69d79 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -144,6 +144,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor b, Tensor a_scales," " Tensor b_scales) -> ()"); ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm); + + // Check if cutlass scaled_mm is supported for CUDA devices of the given + // capability + ops.def("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8); + ops.impl("cutlass_scaled_mm_supports_fp8", torch::kCUDA, + &cutlass_scaled_mm_supports_fp8); #endif // Quantized GEMM for GPTQ. diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index a053a3aa2..e050c1172 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -216,6 +216,10 @@ def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # cutlass +def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool: + return torch.ops._C.cutlass_scaled_mm_supports_fp8(cuda_device_capability) + + def cutlass_scaled_mm(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, out_dtype: Type[torch.dtype]) -> torch.Tensor: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e89fd6581..bbf3cde54 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -20,19 +20,8 @@ logger = init_logger(__name__) def cutlass_fp8_supported() -> bool: capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] - major, minor = torch.version.cuda.split(".") - version = int(major) * 10 + int(minor) - - # CUTLASS FP8 kernels need at least - # CUDA 12.0 on SM90 systems (Hopper) - # CUDA 12.4 on SM89 systems (Lovelace) - gpu_is_supported = False - if capability >= 90: - gpu_is_supported = version > 120 - elif capability >= 89: - gpu_is_supported = version > 124 - - return gpu_is_supported + + return 
ops.cutlass_scaled_mm_supports_fp8(capability) class Fp8Config(QuantizationConfig): -- GitLab From 8065a7e220cca1dd53107da85b6f3932ac9e25e8 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 20 Jun 2024 19:00:13 -0400 Subject: [PATCH 114/376] [Frontend] Add FlexibleArgumentParser to support both underscore and dash in names (#5718) --- benchmarks/benchmark_latency.py | 3 ++- benchmarks/benchmark_prefix_caching.py | 4 ++-- benchmarks/benchmark_serving.py | 7 ++++++- benchmarks/benchmark_throughput.py | 3 ++- .../cutlass_benchmarks/w8a8_benchmarks.py | 3 ++- benchmarks/kernels/benchmark_aqlm.py | 4 ++-- benchmarks/kernels/benchmark_marlin.py | 4 ++-- benchmarks/kernels/benchmark_moe.py | 3 ++- .../kernels/benchmark_paged_attention.py | 6 +++--- benchmarks/kernels/benchmark_rope.py | 4 ++-- benchmarks/overheads/benchmark_hashing.py | 4 ++-- examples/aqlm_example.py | 5 ++--- examples/llm_engine_example.py | 3 ++- examples/save_sharded_state.py | 4 ++-- examples/tensorize_vllm_model.py | 3 ++- tests/async_engine/api_server_async_engine.py | 4 ++-- vllm/engine/arg_utils.py | 17 ++++++++--------- vllm/entrypoints/api_server.py | 5 ++--- vllm/entrypoints/openai/cli_args.py | 3 ++- vllm/entrypoints/openai/run_batch.py | 5 ++--- .../model_executor/model_loader/tensorizer.py | 4 ++-- vllm/utils.py | 19 +++++++++++++++++++ 22 files changed, 72 insertions(+), 45 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index e9d1048c8..a4cf0632b 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -13,6 +13,7 @@ from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptStrictInputs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import FlexibleArgumentParser def main(args: argparse.Namespace): @@ -120,7 +121,7 @@ def main(args: argparse.Namespace): if __name__ == '__main__': - parser = argparse.ArgumentParser( + 
parser = FlexibleArgumentParser( description='Benchmark the latency of processing a single batch of ' 'requests till completion.') parser.add_argument('--model', type=str, default='facebook/opt-125m') diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 089966986..395107a5e 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,7 +1,7 @@ -import argparse import time from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. 
Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501 @@ -44,7 +44,7 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the performance with or without automatic ' 'prefix caching.') parser.add_argument('--model', diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index eef03e7d8..42867fc40 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -44,6 +44,11 @@ try: except ImportError: from backend_request_func import get_tokenizer +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + @dataclass class BenchmarkMetrics: @@ -511,7 +516,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the online serving throughput.") parser.add_argument( "--backend", diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index ed65002bc..2c6beb4e8 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -12,6 +12,7 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import FlexibleArgumentParser def sample_requests( @@ -261,7 +262,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument("--backend", type=str, choices=["vllm", "hf", "mii"], diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py 
b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 5cc0fbbd4..377f8683c 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -11,6 +11,7 @@ from torch.utils.benchmark import Measurement as TMeasurement from weight_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops +from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -293,7 +294,7 @@ if __name__ == '__main__': return torch.float8_e4m3fn raise ValueError("unsupported dtype") - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description=""" Benchmark Cutlass GEMM. diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index ac6a9f297..601c4ea43 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,4 +1,3 @@ -import argparse import os import sys from typing import Optional @@ -10,6 +9,7 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.aqlm import ( dequantize_weight, generic_dequantize_gemm, get_int_dtype, optimized_dequantize_gemm) +from vllm.utils import FlexibleArgumentParser os.environ['CUDA_VISIBLE_DEVICES'] = '0' @@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: def main(): - parser = argparse.ArgumentParser(description="Benchmark aqlm performance.") + parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") # Add arguments parser.add_argument("--nbooks", diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 96f01967b..261f58296 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,4 +1,3 @@ -import argparse from typing import List import torch @@ -16,6 +15,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( 
MarlinWorkspace, marlin_24_quantize, marlin_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( gptq_pack, quantize_weights, sort_weights) +from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -211,7 +211,7 @@ def main(args): # python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501 # if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark Marlin across specified models/shapes/batches") parser.add_argument( "--models", diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 62347aaf8..e00696d6d 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -10,6 +10,7 @@ from ray.experimental.tqdm_ray import tqdm from transformers import AutoConfig from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.utils import FlexibleArgumentParser class BenchmarkConfig(TypedDict): @@ -315,7 +316,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = FlexibleArgumentParser() parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 687e2369b..16de60477 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,4 +1,3 @@ -import argparse import random import time from typing import List, Optional @@ -6,7 +5,8 @@ from typing import List, Optional import torch from vllm import _custom_ops as ops -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, 
FlexibleArgumentParser, + create_kv_caches_with_random) NUM_BLOCKS = 1024 PARTITION_SIZE = 512 @@ -161,7 +161,7 @@ def main( if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the paged attention kernel.") parser.add_argument("--version", type=str, diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index a53c6c77a..78736c7a7 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,4 +1,3 @@ -import argparse from itertools import accumulate from typing import List, Optional @@ -7,6 +6,7 @@ import torch from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, get_rope) +from vllm.utils import FlexibleArgumentParser def benchmark_rope_kernels_multi_lora( @@ -86,7 +86,7 @@ def benchmark_rope_kernels_multi_lora( if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the rotary embedding kernels.") parser.add_argument("--is-neox-style", type=bool, default=True) parser.add_argument("--batch-size", type=int, default=16) diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index c846e47de..203699e9a 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -1,8 +1,8 @@ -import argparse import cProfile import pstats from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser # A very long prompt, total number of tokens is about 15k. LONG_PROMPT = ["You are an expert in large language models, aren't you?" 
@@ -47,7 +47,7 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the performance of hashing function in' 'automatic prefix caching.') parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k') diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 3a63003ab..40f9a21ec 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -1,11 +1,10 @@ -import argparse - from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser def main(): - parser = argparse.ArgumentParser(description='AQLM examples') + parser = FlexibleArgumentParser(description='AQLM examples') parser.add_argument('--model', '-m', diff --git a/examples/llm_engine_example.py b/examples/llm_engine_example.py index a81c4b3e3..ca41f32b1 100644 --- a/examples/llm_engine_example.py +++ b/examples/llm_engine_example.py @@ -2,6 +2,7 @@ import argparse from typing import List, Tuple from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm.utils import FlexibleArgumentParser def create_test_prompts() -> List[Tuple[str, SamplingParams]]: @@ -55,7 +56,7 @@ def main(args: argparse.Namespace): if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Demo on using the LLMEngine class directly') parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/examples/save_sharded_state.py b/examples/save_sharded_state.py index c595d98ba..4207f8922 100644 --- a/examples/save_sharded_state.py +++ b/examples/save_sharded_state.py @@ -20,15 +20,15 @@ llm = LLM( tensor_parallel_size=8, ) """ -import argparse import dataclasses import os import shutil from pathlib import Path from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser -parser = argparse.ArgumentParser() +parser = FlexibleArgumentParser() EngineArgs.add_cli_args(parser) 
parser.add_argument("--output", "-o", diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py index f9ed5fe08..dd77a4ad0 100644 --- a/examples/tensorize_vllm_model.py +++ b/examples/tensorize_vllm_model.py @@ -9,6 +9,7 @@ from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs, TensorizerConfig, tensorize_vllm_model) +from vllm.utils import FlexibleArgumentParser # yapf conflicts with isort for this docstring # yapf: disable @@ -96,7 +97,7 @@ deserialization in this example script, although `--tensorizer-uri` and def parse_args(): - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="An example script that can be used to serialize and " "deserialize vLLM models. These models " "can be loaded using tensorizer directly to the GPU " diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index 1be76fdc8..495a123c3 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -1,5 +1,4 @@ """vllm.entrypoints.api_server with some extra logging for testing.""" -import argparse from typing import Any, Dict import uvicorn @@ -8,6 +7,7 @@ from fastapi.responses import JSONResponse, Response import vllm.entrypoints.api_server from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.utils import FlexibleArgumentParser app = vllm.entrypoints.api_server.app @@ -33,7 +33,7 @@ def stats() -> Response: if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = FlexibleArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) parser = AsyncEngineArgs.add_cli_args(parser) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7f760c277..ef3161242 100644 --- a/vllm/engine/arg_utils.py +++ 
b/vllm/engine/arg_utils.py @@ -11,7 +11,7 @@ from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, SpeculativeConfig, TokenizerPoolConfig, VisionLanguageConfig) from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.utils import str_to_int_tuple +from vllm.utils import FlexibleArgumentParser, str_to_int_tuple def nullable_str(val: str): @@ -110,7 +110,7 @@ class EngineArgs: @staticmethod def add_cli_args_for_vlm( - parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--image-input-type', type=nullable_str, default=None, @@ -156,8 +156,7 @@ class EngineArgs: return parser @staticmethod - def add_cli_args( - parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: """Shared CLI arguments for vLLM engine.""" # Model arguments @@ -800,8 +799,8 @@ class AsyncEngineArgs(EngineArgs): max_log_len: Optional[int] = None @staticmethod - def add_cli_args(parser: argparse.ArgumentParser, - async_args_only: bool = False) -> argparse.ArgumentParser: + def add_cli_args(parser: FlexibleArgumentParser, + async_args_only: bool = False) -> FlexibleArgumentParser: if not async_args_only: parser = EngineArgs.add_cli_args(parser) parser.add_argument('--engine-use-ray', @@ -822,13 +821,13 @@ class AsyncEngineArgs(EngineArgs): # These functions are used by sphinx to build the documentation def _engine_args_parser(): - return EngineArgs.add_cli_args(argparse.ArgumentParser()) + return EngineArgs.add_cli_args(FlexibleArgumentParser()) def _async_engine_args_parser(): - return AsyncEngineArgs.add_cli_args(argparse.ArgumentParser(), + return AsyncEngineArgs.add_cli_args(FlexibleArgumentParser(), async_args_only=True) def _vlm_engine_args_parser(): - return EngineArgs.add_cli_args_for_vlm(argparse.ArgumentParser()) + return EngineArgs.add_cli_args_for_vlm(FlexibleArgumentParser()) 
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 075de0b4e..feb904c5a 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -6,7 +6,6 @@ We are also not going to accept PRs modifying this file, please change `vllm/entrypoints/openai/api_server.py` instead. """ -import argparse import json import ssl from typing import AsyncGenerator @@ -19,7 +18,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams from vllm.usage.usage_lib import UsageContext -from vllm.utils import random_uuid +from vllm.utils import FlexibleArgumentParser, random_uuid TIMEOUT_KEEP_ALIVE = 5 # seconds. app = FastAPI() @@ -80,7 +79,7 @@ async def generate(request: Request) -> Response: if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = FlexibleArgumentParser() parser.add_argument("--host", type=str, default=None) parser.add_argument("--port", type=int, default=8000) parser.add_argument("--ssl-keyfile", type=str, default=None) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 4c0cb1e4f..59ad73bf0 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -10,6 +10,7 @@ import ssl from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.entrypoints.openai.serving_engine import LoRAModulePath +from vllm.utils import FlexibleArgumentParser class LoRAParserAction(argparse.Action): @@ -23,7 +24,7 @@ class LoRAParserAction(argparse.Action): def make_arg_parser(): - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="vLLM OpenAI-Compatible RESTful API server.") parser.add_argument("--host", type=nullable_str, diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 2f1870187..488ac8971 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ 
b/vllm/entrypoints/openai/run_batch.py @@ -1,4 +1,3 @@ -import argparse import asyncio import sys from io import StringIO @@ -16,14 +15,14 @@ from vllm.entrypoints.openai.protocol import (BatchRequestInput, from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import random_uuid +from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) def parse_args(): - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="vLLM OpenAI-Compatible batch runner.") parser.add_argument( "-i", diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index d79fedaea..b009ad8c8 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) +from vllm.utils import FlexibleArgumentParser tensorizer_error_msg = None @@ -177,8 +178,7 @@ class TensorizerArgs: self.deserializer_params['encryption'] = decryption_params @staticmethod - def add_cli_args( - parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: """Tensorizer CLI arguments""" # Tensorizer options arg group diff --git a/vllm/utils.py b/vllm/utils.py index ffe921e65..27a7b1042 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1,3 +1,4 @@ +import argparse import asyncio import datetime import enum @@ -775,3 +776,21 @@ def run_once(f): wrapper.has_run = False # type: ignore[attr-defined] return wrapper + + +class FlexibleArgumentParser(argparse.ArgumentParser): + """ArgumentParser that allows both underscore and dash in names.""" + + def 
parse_args(self, args=None, namespace=None): + if args is None: + args = sys.argv[1:] + + # Convert underscores to dashes and vice versa in argument names + processed_args = [] + for arg in args: + if arg.startswith('--'): + processed_args.append('--' + arg[len('--'):].replace('_', '-')) + else: + processed_args.append(arg) + + return super().parse_args(processed_args, namespace) -- GitLab From 6c5b7af1525a2013d7b1806dd6c0c9a53404be6d Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 20 Jun 2024 17:06:34 -0700 Subject: [PATCH 115/376] [distributed][misc] use fork by default for mp (#5669) --- .buildkite/test-pipeline.yaml | 9 ++++++ .../custom_all_reduce_utils.py | 28 ++++++++++++++++++- vllm/envs.py | 4 +-- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 95cd5b198..5e92ba3c2 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -37,6 +37,9 @@ steps: working_dir: "/vllm-workspace/tests" num_gpus: 2 commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py @@ -55,6 +58,9 @@ steps: working_dir: "/vllm-workspace/tests" num_gpus: 4 commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s distributed/test_pynccl.py # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here. 
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. @@ -145,6 +151,9 @@ steps: num_gpus: 4 # This test runs llama 13B, so it is required to run on 4 GPUs. commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s -x lora/test_long_context.py - label: Tensorizer Test diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index e0641a54c..d3e41fa71 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -1,6 +1,9 @@ import ctypes import json import os +import pickle +import subprocess +import sys from itertools import product from typing import Dict, List, Optional, Sequence @@ -198,7 +201,25 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: ids = list(range(num_dev)) # batch of all pairs of GPUs batch_src, batch_tgt = zip(*list(product(ids, ids))) - result = can_actually_p2p(batch_src, batch_tgt) + # NOTE: we use `subprocess` rather than `multiprocessing` here + # because the caller might not have `if __name__ == "__main__":`, + # in that case we cannot use spawn method in multiprocessing. + # However, `can_actually_p2p` requires spawn method. + # The fix is, we use `subprocess` to call the function, + # where we have `if __name__ == "__main__":` in this file. 
+ input_bytes = pickle.dumps((batch_src, batch_tgt)) + returned = subprocess.run([sys.executable, __file__], + input=input_bytes, + capture_output=True) + # check if the subprocess is successful + try: + returned.check_returncode() + except Exception as e: + # wrap raised exception to provide more information + raise RuntimeError( + f"Error happened when batch testing " + f"peer-to-peer access from {batch_src} to {batch_tgt}") from e + result = pickle.loads(returned.stdout) for _i, _j, r in zip(batch_src, batch_tgt, result): cache[f"{_i}->{_j}"] = r with open(path, "w") as f: @@ -213,3 +234,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: __all__ = ["gpu_p2p_access_check"] + +if __name__ == "__main__": + batch_src, batch_tgt = pickle.loads(sys.stdin.buffer.read()) + result = can_actually_p2p(batch_src, batch_tgt) + sys.stdout.buffer.write(pickle.dumps(result)) diff --git a/vllm/envs.py b/vllm/envs.py index f03b69f4b..ae2fcd082 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -29,7 +29,7 @@ if TYPE_CHECKING: VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/" VLLM_USE_RAY_COMPILED_DAG: bool = False - VLLM_WORKER_MULTIPROC_METHOD: str = "spawn" + VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_TARGET_DEVICE: str = "cuda" MAX_JOBS: Optional[str] = None @@ -212,7 +212,7 @@ environment_variables: Dict[str, Callable[[], Any]] = { # Use dedicated multiprocess context for workers. 
# Both spawn and fork work "VLLM_WORKER_MULTIPROC_METHOD": - lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"), + lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"), # Timeout for fetching images when serving multimodal models # Default is 5 seconds -- GitLab From b12518d3cf4326dfcd10a09780913b86c19fcf1a Mon Sep 17 00:00:00 2001 From: Joshua Rosenkranz Date: Thu, 20 Jun 2024 20:23:12 -0400 Subject: [PATCH 116/376] [Model] MLPSpeculator speculative decoding support (#4947) Signed-off-by: Thomas Parnell Co-authored-by: Thomas Parnell Co-authored-by: Nick Hill Co-authored-by: Davis Wertheimer --- examples/offline_inference_mlpspeculator.py | 59 ++++++++ tests/spec_decode/test_spec_decode_worker.py | 8 +- tests/spec_decode/test_utils.py | 4 +- vllm/config.py | 54 +++++-- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/mlp_speculator.py | 143 ++++++++++++++++++ vllm/sequence.py | 46 ++++++ vllm/spec_decode/batch_expansion.py | 6 +- vllm/spec_decode/interfaces.py | 4 + vllm/spec_decode/mlp_speculator_worker.py | 87 +++++++++++ vllm/spec_decode/spec_decode_worker.py | 42 ++++- vllm/spec_decode/top1_proposer.py | 4 + vllm/spec_decode/util.py | 8 - vllm/transformers_utils/config.py | 18 ++- vllm/transformers_utils/configs/__init__.py | 2 + .../configs/mlp_speculator.py | 50 ++++++ vllm/worker/model_runner.py | 18 ++- vllm/worker/worker.py | 9 ++ 18 files changed, 523 insertions(+), 40 deletions(-) create mode 100644 examples/offline_inference_mlpspeculator.py create mode 100644 vllm/model_executor/models/mlp_speculator.py create mode 100644 vllm/spec_decode/mlp_speculator_worker.py create mode 100644 vllm/transformers_utils/configs/mlp_speculator.py diff --git a/examples/offline_inference_mlpspeculator.py b/examples/offline_inference_mlpspeculator.py new file mode 100644 index 000000000..5448ec1f6 --- /dev/null +++ b/examples/offline_inference_mlpspeculator.py @@ -0,0 +1,59 @@ +import gc +import time +from typing import List + +from 
vllm import LLM, SamplingParams + + +def time_generation(llm: LLM, prompts: List[str], + sampling_params: SamplingParams): + # Generate texts from the prompts. The output is a list of RequestOutput + # objects that contain the prompt, generated text, and other information. + # Warmup first + llm.generate(prompts, sampling_params) + llm.generate(prompts, sampling_params) + start = time.time() + outputs = llm.generate(prompts, sampling_params) + end = time.time() + print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs])) + # Print the outputs. + for output in outputs: + generated_text = output.outputs[0].text + print(f"text: {generated_text!r}") + + +if __name__ == "__main__": + + template = ( + "Below is an instruction that describes a task. Write a response " + "that appropriately completes the request.\n\n### Instruction:\n{}" + "\n\n### Response:\n") + + # Sample prompts. + prompts = [ + "Write about the president of the United States.", + ] + prompts = [template.format(prompt) for prompt in prompts] + # Create a sampling params object. 
+ sampling_params = SamplingParams(temperature=0.0, max_tokens=200) + + # Create an LLM without spec decoding + llm = LLM(model="meta-llama/Llama-2-13b-chat-hf") + + print("Without speculation") + time_generation(llm, prompts, sampling_params) + + del llm + gc.collect() + + # Create an LLM with spec decoding + llm = LLM( + model="meta-llama/Llama-2-13b-chat-hf", + speculative_model="ibm-fms/llama-13b-accelerator", + # These are currently required for MLPSpeculator decoding + use_v2_block_manager=True, + enforce_eager=True, + ) + + print("With speculation") + time_generation(llm, prompts, sampling_params) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index afaeffc96..a20c793c9 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -456,7 +456,9 @@ def test_k_equals_zero(k: int, batch_size: int): rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) - target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)] + sampler_output = MagicMock(spec=SamplerOutput) + sampler_output.hidden_states = None + target_worker.execute_model.return_value = [sampler_output] draft_worker.device = 'cuda' target_worker.device = 'cuda' @@ -497,7 +499,9 @@ def test_empty_input_batch(k: int, batch_size: int): rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) - target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)] + sampler_output = MagicMock(spec=SamplerOutput) + sampler_output.hidden_states = None + target_worker.execute_model.return_value = [sampler_output] draft_worker.device = 'cuda' target_worker.device = 'cuda' diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py index 6b6f35a1a..bccbf9a6a 100644 --- a/tests/spec_decode/test_utils.py +++ b/tests/spec_decode/test_utils.py @@ -2,8 +2,8 @@ from unittest.mock 
import MagicMock import pytest -from vllm.sequence import SequenceGroupMetadata -from vllm.spec_decode.util import get_all_seq_ids, split_batch_by_proposal_len +from vllm.sequence import SequenceGroupMetadata, get_all_seq_ids +from vllm.spec_decode.util import split_batch_by_proposal_len def test_get_all_seq_ids(): diff --git a/vllm/config.py b/vllm/config.py index 5de00d7d3..8d004902f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -230,7 +230,8 @@ class ModelConfig: self, parallel_config: "ParallelConfig", ) -> None: - total_num_attention_heads = self.hf_text_config.num_attention_heads + total_num_attention_heads = getattr(self.hf_text_config, + "num_attention_heads", 0) tensor_parallel_size = parallel_config.tensor_parallel_size if total_num_attention_heads % tensor_parallel_size != 0: raise ValueError( @@ -238,7 +239,8 @@ class ModelConfig: " must be divisible by tensor parallel size " f"({tensor_parallel_size}).") - total_num_hidden_layers = self.hf_text_config.num_hidden_layers + total_num_hidden_layers = getattr(self.hf_text_config, + "num_hidden_layers", 0) pipeline_parallel_size = parallel_config.pipeline_parallel_size if total_num_hidden_layers % pipeline_parallel_size != 0: raise ValueError( @@ -341,8 +343,8 @@ class ModelConfig: def get_num_attention_heads(self, parallel_config: "ParallelConfig") -> int: - return self.hf_text_config.num_attention_heads // \ - parallel_config.tensor_parallel_size + num_heads = getattr(self.hf_text_config, "num_attention_heads", 0) + return num_heads // parallel_config.tensor_parallel_size def get_num_layers(self, parallel_config: "ParallelConfig") -> int: total_num_hidden_layers = self.hf_text_config.num_hidden_layers @@ -818,7 +820,8 @@ class SpeculativeConfig: speculative_model (Optional[str]): The name of the speculative model, if provided. num_speculative_tokens (Optional[int]): The number of speculative - tokens, if provided. + tokens, if provided. 
Will default to the number in the draft + model config if present, otherwise is required. speculative_max_model_len (Optional[int]): The maximum model len of the speculative model. Used when testing the ability to skip speculation for some sequences. @@ -841,24 +844,18 @@ class SpeculativeConfig: the necessary conditions are met, else None. """ - if speculative_model is None and num_speculative_tokens is None: + if speculative_model is None: + if num_speculative_tokens is not None: + raise ValueError("num_speculative_tokens was provided without " + "speculative_model.") return None - if speculative_model is not None and num_speculative_tokens is None: - raise ValueError( - "Expected both speculative_model and " - "num_speculative_tokens to be provided, but found " - f"{speculative_model=} and {num_speculative_tokens=}.") - if (speculative_disable_by_batch_size is not None and speculative_disable_by_batch_size < 2): raise ValueError("Expect the batch size threshold of disabling " "speculative decoding is > 1, but got " f"{speculative_disable_by_batch_size=}") - assert (speculative_model is not None - and num_speculative_tokens is not None) - if enable_chunked_prefill: raise ValueError( "Speculative decoding and chunked prefill are " @@ -912,6 +909,27 @@ class SpeculativeConfig: max_logprobs=target_model_config.max_logprobs, ) + if (draft_model_config.hf_config.model_type == "mlp_speculator" + and target_parallel_config.world_size != 1): + # MLPSpeculator TP support will be added very soon + raise ValueError( + "Speculative decoding with mlp_speculator models does not " + "yet support distributed inferencing (TP > 1).") + + n_predict = getattr(draft_model_config.hf_config, "n_predict", + None) + if n_predict is not None: + if num_speculative_tokens is None: + # Default to max value defined in draft model config. 
+ num_speculative_tokens = n_predict + elif num_speculative_tokens > n_predict: + # Verify provided value doesn't exceed the maximum + # supported by the draft model. + raise ValueError( + "This speculative model supports a maximum of " + f"num_speculative_tokens={n_predict}, but " + f"{num_speculative_tokens=} was provided.") + draft_model_config.max_model_len = ( SpeculativeConfig._maybe_override_draft_max_model_len( speculative_max_model_len, @@ -923,6 +941,12 @@ class SpeculativeConfig: SpeculativeConfig.create_draft_parallel_config( target_parallel_config)) + if num_speculative_tokens is None: + raise ValueError( + "num_speculative_tokens must be provided with " + "speculative_model unless the draft model config contains an " + "n_predict parameter.") + return SpeculativeConfig( draft_model_config, draft_parallel_config, diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index f9ec72096..5afb2e1d4 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -60,6 +60,7 @@ _GENERATION_MODELS = { "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"), "XverseForCausalLM": ("xverse", "XverseForCausalLM"), "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"), + "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), } _EMBEDDING_MODELS = { diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py new file mode 100644 index 000000000..b18269777 --- /dev/null +++ b/vllm/model_executor/models/mlp_speculator.py @@ -0,0 +1,143 @@ +import math +from typing import Iterable, List, Tuple + +import torch +import torch.nn as nn + +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from
vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.sequence import SamplerOutput + + +class MLPSpeculatorLayerNorm(nn.Module): + """ + A L2 normalization implementation + ... + Args + ---- + normalized_shape : int + Dimensionality of input data (size of final tensor axis) + eps : float + Safety term to prevent division by zero. Make sure the chosen value + fits in the range of your encoding scheme + (i.e. fp16 requires eps >= 6e-8). + """ + + def __init__( + self, + normalized_shape, + eps=1e-06, + ): + super(MLPSpeculatorLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.empty(normalized_shape)) + self.bias = nn.Parameter(torch.empty(normalized_shape)) + self.eps = eps + + def forward(self, x): + xf = x + xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps) + x = xf.type_as(x) + x = self.weight * x + x = x + self.bias + return x + + +class MLPSpeculator(nn.Module): + + def __init__(self, config, **kwargs) -> None: + super().__init__() + self.n_predict = config.n_predict + self.vocab_size = config.vocab_size + self.emb_dim = config.emb_dim + self.inner_dim = config.inner_dim if config.inner_dim != 0 \ + else config.emb_dim + + self.max_speculative_tokens = getattr(config, "max_speculative_tokens", + self.n_predict) + + self.emb = nn.ModuleList([ + VocabParallelEmbedding(config.vocab_size, + self.inner_dim, + org_num_embeddings=config.vocab_size) + for _ in range(self.max_speculative_tokens) + ]) + + self.proj = nn.ModuleList([ + nn.Linear((self.emb_dim if i == 0 else self.inner_dim), + self.inner_dim, + bias=False) for i in range(self.max_speculative_tokens) + ]) + + self.head = nn.ModuleList([ + nn.Linear(self.inner_dim, self.vocab_size, bias=False) + for _ in range(self.max_speculative_tokens) + ]) + self.ln = nn.ModuleList([ + MLPSpeculatorLayerNorm(self.inner_dim) + for _ in range(self.max_speculative_tokens) + ]) + + self.state_weight = 0.5**(0.5 / config.n_predict) + self.emb_weight = math.sqrt( + 
(1 - self.state_weight**2) * (self.inner_dim / 2)) + self.activation = nn.GELU() + self.config = config + self.logits_processor = LogitsProcessor(config.vocab_size, + config.vocab_size, 1.0) + self.sampler = Sampler() + + def generate_proposals( + self, + input_ids: torch.Tensor, + previous_hidden_states: torch.Tensor, + num_predict_tokens: int, + sampling_metadata: SamplingMetadata, + ) -> List[SamplerOutput]: + if num_predict_tokens > self.max_speculative_tokens: + raise ValueError(f"Max speculative tokens for model is " + f"{self.max_speculative_tokens}, but " + f"{num_predict_tokens} were requested") + + # b x 1 x d + previous_hidden_states = previous_hidden_states.unsqueeze(1) + + # b x 1 + last_tokens = input_ids.unsqueeze(1) + + next_tokens = [] + + for head_index in range(num_predict_tokens): + + # Project and predict + z = self.emb[head_index](last_tokens) # b k d + states = self.proj[head_index](previous_hidden_states) + + # Weighted add of state_weight*state and emb_weight*z + # Let subsequent LN take care of denominator + # state_weight is close to 1, so shouldn't be any precision issues + states.add_(z, alpha=self.emb_weight / self.state_weight) + + states = self.activation(self.ln[head_index](states)) # b k d + # TODO: not yet supporting top_k_tokens_per_head + previous_hidden_states = states + + logits = self.logits_processor(self.head[head_index].weight, + states, sampling_metadata) + + output = self.sampler(logits.flatten(0, 1), sampling_metadata) + last_tokens = output.sampled_token_ids + next_tokens.append(output) + + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + param = params_dict[name.replace("speculator.", "")] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/sequence.py b/vllm/sequence.py index 38d3349f2..287e1b9df 100644 --- 
a/vllm/sequence.py +++ b/vllm/sequence.py @@ -794,6 +794,9 @@ class SamplerOutput: # Spec decode metrics populated by workers. spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None + # Optional last hidden states from the model. + hidden_states: Optional[torch.Tensor] = None + def __getitem__(self, idx: int): return self.outputs[idx] @@ -842,6 +845,46 @@ class PoolerOutput: self.__class__) and self.outputs == other.outputs +def get_all_seq_ids( + seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]: + """Given a list of SequenceGroupMetadata, create a list of all + sequence ids. + """ + return [seq_id for sg in seq_group_metadata_list for seq_id in sg.seq_data] + + +class HiddenStates: + """Hidden states corresponding to in-progress sequences. + Used in speculative decoding to pass hidden states from + the target model to the proposer model in the subsequent step. + + seq_ids are the sequence ids of each entry of the batch + dimension of the hidden_states tensor""" + + def __init__(self, seq_group_metadata_list: List[SequenceGroupMetadata], + hidden_states: torch.Tensor): + assert len(seq_group_metadata_list) == len(hidden_states) + self.seq_ids: List[int] = get_all_seq_ids(seq_group_metadata_list) + self.hidden_states: torch.Tensor = hidden_states + + def update(self, seq_group_metadata_list: List[SequenceGroupMetadata], + hidden_states: torch.Tensor) -> None: + """Update hidden states from target model invocation.""" + assert len(seq_group_metadata_list) == len(hidden_states) + self.seq_ids.extend(get_all_seq_ids(seq_group_metadata_list)) + self.hidden_states = torch.cat([self.hidden_states, hidden_states]) + + def prune(self, + seq_group_metadata_list: List[SequenceGroupMetadata]) -> None: + """Prune to provided list of sequence ids.""" + seq_ids = get_all_seq_ids(seq_group_metadata_list) + if seq_ids != self.seq_ids: + # Batch contents changed - prune removed sequences. 
+ index = [self.seq_ids.index(seq_id) for seq_id in seq_ids] + self.hidden_states = self.hidden_states[index] + self.seq_ids = seq_ids + + @dataclass class ExecuteModelRequest: """The model execution request.""" @@ -857,6 +900,8 @@ class ExecuteModelRequest: num_lookahead_slots: int = 0 # The number of requests in the running queue. running_queue_size: int = 0 + # Optional hidden states from prior step. + previous_hidden_states: Optional[HiddenStates] = None def clone( self, seq_group_metadata_list: List[SequenceGroupMetadata] @@ -869,4 +914,5 @@ class ExecuteModelRequest: blocks_to_copy=self.blocks_to_copy.copy(), num_lookahead_slots=self.num_lookahead_slots, running_queue_size=self.running_queue_size, + previous_hidden_states=self.previous_hidden_states, ) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 1bde04208..405165563 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -4,11 +4,10 @@ from typing import Iterator, List, Tuple import torch from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData, - SequenceGroupMetadata) + SequenceGroupMetadata, get_all_seq_ids) from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) -from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, - sampler_output_to_torch, +from vllm.spec_decode.util import (nvtx_range, sampler_output_to_torch, split_batch_by_proposal_len) from vllm.worker.worker_base import WorkerBase @@ -98,6 +97,7 @@ class BatchExpansionTop1Scorer(SpeculativeScorer): probs=all_probs, token_ids=all_tokens, logprobs=spec_logprobs, + hidden_states=target_sampler_output.hidden_states, ) def _expand_batch( diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index 72d7818eb..d236fc0f2 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod from dataclasses import 
dataclass +from typing import Optional import torch @@ -46,6 +47,9 @@ class SpeculativeScores: # tokens and also non-speculative normal decoding. token_ids: torch.Tensor + # Optional last hidden states from the scoring model. + hidden_states: Optional[torch.Tensor] = None + def __repr__(self): return (f"SpeculativeScores(" f"probs={self.probs.shape}, " diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py new file mode 100644 index 000000000..0926e13be --- /dev/null +++ b/vllm/spec_decode/mlp_speculator_worker.py @@ -0,0 +1,87 @@ +from typing import List, Optional, Tuple + +import torch + +from vllm.model_executor import SamplingMetadata +from vllm.sequence import (ExecuteModelRequest, SamplerOutput, + SequenceGroupMetadata) +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase +from vllm.worker.model_runner import ModelInput + + +class MLPSpeculatorWorker(NonLLMProposerWorkerBase, MultiStepWorker): + """Worker for MLPSpeculator models. + + Not currently compatible with LoRA or chunked prefill. + """ + + @torch.inference_mode() + def sampler_output( + self, + execute_model_req: ExecuteModelRequest, + sample_len: int, + ) -> Tuple[List[SamplerOutput], bool]: + """Run the model forward pass to generate sample_len future tokens. + Returns the list of sampler output, one per layer, along with indicator + of whether torch tensor in sampler output need to be transposed in + later sampler_output_to_torch logic. + + For mlp spec worker, this indicator shall be True.
+ """ + self._raise_if_unsupported(execute_model_req) + + seq_group_metadata_list = execute_model_req.seq_group_metadata_list + + (input_tokens, seq_lens, + query_lens) = self._prepare_input_tensors(seq_group_metadata_list) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, seq_lens, query_lens, self.device, + self.model_runner.pin_memory) + + model_outputs = self.model_runner.model.generate_proposals( + input_ids=input_tokens, + previous_hidden_states=execute_model_req.previous_hidden_states. + hidden_states, + num_predict_tokens=sample_len, + sampling_metadata=sampling_metadata) + + assert len(model_outputs) == sample_len + + return model_outputs, True + + def _prepare_input_tensors( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + ) -> Tuple[torch.Tensor, List[int], List[int]]: + if not seq_group_metadata_list: + return ModelInput.empty(self.device) + + input_tokens: List[int] = [] + seq_lens: List[int] = [] + query_lens: List[int] = [] + + for seq_group_metadata in seq_group_metadata_list: + is_prompt = seq_group_metadata.is_prompt + + for seq_data in seq_group_metadata.seq_data.values(): + seq_data_len = seq_data.get_len() + if is_prompt: + context_len = seq_data.get_num_computed_tokens() + seq_len = min( + seq_data_len, + context_len + seq_group_metadata.token_chunk_size) + tokens = seq_data.get_token_ids()[context_len:seq_len] + seq_lens.append(seq_len) + input_tokens.extend(tokens) + query_lens.append(seq_len - context_len) + else: + seq_lens.append(seq_data_len) + input_tokens.append(seq_data.get_last_token_id()) + query_lens.append(1) + + input_tokens_tensor = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) + return input_tokens_tensor, seq_lens, query_lens diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 03fad5663..58d3461a2 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -8,16 
+8,18 @@ from vllm.distributed.communication_op import broadcast_tensor_dict from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest, - SamplerOutput, SequenceGroupMetadata) + HiddenStates, SamplerOutput, SequenceGroupMetadata, + get_all_seq_ids) from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) from vllm.spec_decode.metrics import AsyncMetricsCollector +from vllm.spec_decode.mlp_speculator_worker import MLPSpeculatorWorker from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.util import (create_sequence_group_output, - get_all_num_logprobs, get_all_seq_ids, + get_all_num_logprobs, get_sampled_token_logprobs, nvtx_range, split_batch_by_proposal_len) from vllm.worker.worker import Worker @@ -104,6 +106,10 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): proposer_worker = NGramWorker(**draft_worker_kwargs) proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, ngram_prompt_lookup_max) + elif draft_worker_kwargs[ + "model_config"].hf_config.model_type == "mlp_speculator": + proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs) + disable_bonus_tokens = False else: proposer_worker = MultiStepWorker(**draft_worker_kwargs) @@ -155,6 +161,10 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): # Lazy initiazliation. self.scorer: SpeculativeScorer + # Hidden states from target model to pass to proposer + # in the subsequent step. + self.previous_hidden_states: Optional[HiddenStates] = None + def init_device(self) -> None: """Initialize both scorer and proposer models. 
""" @@ -337,6 +347,16 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): assert len(sampler_output) == 1 sampler_output = sampler_output[0] + # Store hidden states from target model execution. + hidden_states = sampler_output.hidden_states + if hidden_states is not None: + if self.previous_hidden_states is None: + self.previous_hidden_states = HiddenStates( + execute_model_req.seq_group_metadata_list, hidden_states) + else: + self.previous_hidden_states.update( + execute_model_req.seq_group_metadata_list, hidden_states) + # Clear device tensors from sampler output. This reduces communication # overhead when the engine runs in a different process than the workers. sampler_output.probs = None @@ -383,6 +403,10 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): """ assert num_lookahead_slots == execute_model_req.num_lookahead_slots + # Pass last hidden states from target model to proposer + execute_model_req.previous_hidden_states = self.previous_hidden_states + self.previous_hidden_states = None + # Generate proposals using draft worker. proposals = self.proposer_worker.get_spec_proposals(execute_model_req) @@ -466,6 +490,20 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): # metadata. 
accepted_token_ids[original_indices] = accepted_token_ids.clone() + hidden_states = proposal_scores.hidden_states + if hidden_states is not None: + # Contract hidden states based on accepted tokens + hs_size = hidden_states.shape[1] + hidden_states = hidden_states.reshape(-1, max_proposal_len + 1, + hs_size) + accepted_index = accepted_token_ids + 1 # Convert -1 to 0 + accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) + index = accepted_index[:, None, None].expand(-1, 1, hs_size) + hidden_states = hidden_states.gather(1, index).squeeze(1) # b x d + # Store hidden states from target model for subsequent decode step + self.previous_hidden_states = HiddenStates(seq_group_metadata_list, + hidden_states) + return accepted_token_ids, logprobs def _create_output_sampler_list( diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 278db94bf..d3e280e68 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -65,9 +65,13 @@ class Top1Proposer(SpeculativeProposer): # token_ids is like [batch] format in proposal_len size list, # while if it is false, the format would be [proposal_len] # in batch size list + hidden_states = execute_model_req.previous_hidden_states + if hidden_states is not None: + hidden_states.prune(nonzero_proposal_len_seqs) nonzero_execute_model_req = ExecuteModelRequest( seq_group_metadata_list=nonzero_proposal_len_seqs, num_lookahead_slots=proposal_len, + previous_hidden_states=hidden_states, ) maybe_sampler_output, transposed = self._worker.sampler_output( execute_model_req=nonzero_execute_model_req, diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 9bbe3f8d1..80710419e 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -10,14 +10,6 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, SeqId = int -def get_all_seq_ids( - seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[SeqId]: - """Given a list of 
SequenceGroupMetadata, create a list of all - sequence ids. - """ - return [seq_id for sg in seq_group_metadata_list for seq_id in sg.seq_data] - - def get_all_num_logprobs( seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]: """Given a list of SequenceGroupMetadata, create a list of all num_logprobs. diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index ada840182..60fc756a1 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -1,3 +1,4 @@ +import contextlib from typing import Dict, Optional, Type from transformers import PretrainedConfig @@ -5,7 +6,13 @@ from transformers import PretrainedConfig from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, - JAISConfig, MPTConfig, RWConfig) + JAISConfig, MLPSpeculatorConfig, + MPTConfig, RWConfig) + +if VLLM_USE_MODELSCOPE: + from modelscope import AutoConfig +else: + from transformers import AutoConfig logger = init_logger(__name__) @@ -16,8 +23,13 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) "jais": JAISConfig, + "mlp_speculator": MLPSpeculatorConfig, } +for name, cls in _CONFIG_REGISTRY.items(): + with contextlib.suppress(ValueError): + AutoConfig.register(name, cls) + def get_config(model: str, trust_remote_code: bool, @@ -26,10 +38,6 @@ def get_config(model: str, rope_scaling: Optional[dict] = None, rope_theta: Optional[float] = None) -> PretrainedConfig: try: - if VLLM_USE_MODELSCOPE: - from modelscope import AutoConfig - else: - from transformers import AutoConfig config = AutoConfig.from_pretrained( model, trust_remote_code=trust_remote_code, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 0e4869288..d8170858c 100644 --- 
a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -5,6 +5,7 @@ from vllm.transformers_utils.configs.dbrx import DbrxConfig # `FalconConfig` class from the official HuggingFace transformers library. from vllm.transformers_utils.configs.falcon import RWConfig from vllm.transformers_utils.configs.jais import JAISConfig +from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.mpt import MPTConfig __all__ = [ @@ -13,4 +14,5 @@ __all__ = [ "MPTConfig", "RWConfig", "JAISConfig", + "MLPSpeculatorConfig", ] diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py new file mode 100644 index 000000000..dd1d92b86 --- /dev/null +++ b/vllm/transformers_utils/configs/mlp_speculator.py @@ -0,0 +1,50 @@ +from typing import List, Optional + +from transformers import PretrainedConfig + + +class MLPSpeculatorConfig(PretrainedConfig): + model_type = "mlp_speculator" + + attribute_map = { + "hidden_size": "emb_dim", + } + + def __init__(self, + vocab_size: int = 32000, + emb_dim: int = 4096, + inner_dim: int = 0, + n_predict: int = 3, + top_k_tokens_per_head: Optional[List[int]] = None, + n_candidates: int = 5, + **kwargs): + """ + Initialize an MLPSpeculatorConfig + + Args: + vocab_size: int + the model vocab size + emb_dim: int + the model embedding dimension + inner_dim: int + the inner dimension of the model. If 0, will be the emb_dim. + n_predict: int + the number of lookaheads for the speculator + top_k_tokens_per_head: List[int] + Number of tokens to consider from each head when forming the + candidate tree. + For each candidate branch in the tree, head n produces topk[n] + additional sub-branches. 
+ n_candidates: int + number of child candidates to create per sequence + """ + if top_k_tokens_per_head is None: + top_k_tokens_per_head = [5, 4, 3] + assert len(top_k_tokens_per_head) == n_predict + self.vocab_size = vocab_size + self.emb_dim = emb_dim + self.inner_dim = inner_dim + self.n_predict = n_predict + self.top_k_tokens_per_head = top_k_tokens_per_head + self.n_candidates = n_candidates + super().__init__(**kwargs) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index d0baa4337..e24835a1e 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -86,6 +86,7 @@ class ModelRunner: kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, vision_language_config: Optional[VisionLanguageConfig] = None, + return_hidden_states: bool = False, ): self.model_config = model_config self.parallel_config = parallel_config @@ -96,6 +97,7 @@ class ModelRunner: self.load_config = load_config self.is_driver_worker = is_driver_worker self.vision_language_config = vision_language_config + self.return_hidden_states = return_hidden_states self.device = self.device_config.device self.pin_memory = is_pin_memory_available() @@ -116,15 +118,17 @@ class ModelRunner: self.graph_block_tables = np.zeros( (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()), dtype=np.int32) + num_attn_heads = self.model_config.get_num_attention_heads( + self.parallel_config) self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), + num_attn_heads, self.model_config.get_head_size(), self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, - ) + ) if num_attn_heads else None # Create processor for multi-modal data if self.vision_language_config is not None: @@ -762,11 +766,19 @@ class ModelRunner: return None # Sample the next token. 
- output = self.model.sample( + output: SamplerOutput = self.model.sample( logits=logits, sampling_metadata=sampling_metadata, ) + if self.return_hidden_states: + # we only need to pass hidden states of most recent token + assert seq_group_metadata_list is not None + if seq_group_metadata_list[0].is_prompt: + hidden_states = hidden_states.index_select( + 0, sampling_metadata.selected_token_indices) + output.hidden_states = hidden_states + return output @torch.inference_mode() diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index f9b8a065a..e334ffbb7 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -70,6 +70,14 @@ class Worker(WorkerBase): assert not self.lora_config, ( "To be tested: vision language model with LoRA settings.") + # Return hidden states from target model if the draft model is an + # mlp_speculator + speculative_args = {} if speculative_config is None \ + or (speculative_config.draft_model_config.model == + model_config.model) \ + or (speculative_config.draft_model_config.hf_config.model_type != + "mlp_speculator") else {"return_hidden_states": True} + ModelRunnerClass = (EmbeddingModelRunner if self.model_config.embedding_mode else ModelRunner) self.model_runner = ModelRunnerClass( @@ -83,6 +91,7 @@ class Worker(WorkerBase): kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, vision_language_config=vision_language_config, + **speculative_args, ) # Uninitialized cache engine. Will be initialized by # initialize_cache. 
-- GitLab From 1f5674218f968dec625d0995fe5cd5d626db9188 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Fri, 21 Jun 2024 08:55:41 +0800 Subject: [PATCH 117/376] [Kernel] Add punica dimension for Qwen2 LoRA (#5441) --- csrc/punica/bgmv/bgmv_config.h | 38 ++++++++++++++++++++++++++++++++-- tests/lora/test_punica.py | 17 +++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index c38db2dcd..cb6694b30 100755 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -16,15 +16,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 512) \ f(in_T, out_T, W_T, narrow, 640) \ f(in_T, out_T, W_T, narrow, 768) \ + f(in_T, out_T, W_T, narrow, 896) \ f(in_T, out_T, W_T, narrow, 1024) \ f(in_T, out_T, W_T, narrow, 1152) \ + f(in_T, out_T, W_T, narrow, 1216) \ f(in_T, out_T, W_T, narrow, 1280) \ f(in_T, out_T, W_T, narrow, 1536) \ f(in_T, out_T, W_T, narrow, 1664) \ f(in_T, out_T, W_T, narrow, 1728) \ f(in_T, out_T, W_T, narrow, 1792) \ f(in_T, out_T, W_T, narrow, 2048) \ + f(in_T, out_T, W_T, narrow, 2240) \ f(in_T, out_T, W_T, narrow, 2304) \ + f(in_T, out_T, W_T, narrow, 2368) \ + f(in_T, out_T, W_T, narrow, 2432) \ f(in_T, out_T, W_T, narrow, 2560) \ f(in_T, out_T, W_T, narrow, 2752) \ f(in_T, out_T, W_T, narrow, 2816) \ @@ -32,8 +37,12 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 3328) \ f(in_T, out_T, W_T, narrow, 3456) \ f(in_T, out_T, W_T, narrow, 3584) \ + f(in_T, out_T, W_T, narrow, 3712) \ f(in_T, out_T, W_T, narrow, 4096) \ + f(in_T, out_T, W_T, narrow, 4480) \ f(in_T, out_T, W_T, narrow, 4608) \ + f(in_T, out_T, W_T, narrow, 4736) \ + f(in_T, out_T, W_T, narrow, 4864) \ f(in_T, out_T, W_T, narrow, 5120) \ f(in_T, out_T, W_T, narrow, 5504) \ f(in_T, out_T, W_T, narrow, 5632) \ @@ -43,8 +52,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, 
f(in_T, out_T, W_T, narrow, 6848) \ f(in_T, out_T, W_T, narrow, 6912) \ f(in_T, out_T, W_T, narrow, 7168) \ + f(in_T, out_T, W_T, narrow, 7424) \ f(in_T, out_T, W_T, narrow, 8192) \ + f(in_T, out_T, W_T, narrow, 8960) \ f(in_T, out_T, W_T, narrow, 9216) \ + f(in_T, out_T, W_T, narrow, 9472) \ f(in_T, out_T, W_T, narrow, 10240) \ f(in_T, out_T, W_T, narrow, 11008) \ f(in_T, out_T, W_T, narrow, 11264) \ @@ -52,8 +64,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 13696) \ f(in_T, out_T, W_T, narrow, 13824) \ f(in_T, out_T, W_T, narrow, 14336) \ + f(in_T, out_T, W_T, narrow, 14784) \ + f(in_T, out_T, W_T, narrow, 14848) \ f(in_T, out_T, W_T, narrow, 15360) \ f(in_T, out_T, W_T, narrow, 16384) \ + f(in_T, out_T, W_T, narrow, 18944) \ f(in_T, out_T, W_T, narrow, 20480) \ f(in_T, out_T, W_T, narrow, 22016) \ f(in_T, out_T, W_T, narrow, 22528) \ @@ -61,6 +76,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 27392) \ f(in_T, out_T, W_T, narrow, 27648) \ f(in_T, out_T, W_T, narrow, 28672) \ + f(in_T, out_T, W_T, narrow, 29568) \ + f(in_T, out_T, W_T, narrow, 29696) \ f(in_T, out_T, W_T, narrow, 32000) \ f(in_T, out_T, W_T, narrow, 32256) \ f(in_T, out_T, W_T, narrow, 32512) \ @@ -85,9 +102,9 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, // Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA // and vllm/tests/lora/test_punica.py -// Used for defining kernels going from the variety of +// Used for defining kernels going from the variety of // dim in to the narrow dim out - // Using it for the fully sharded column + // Using it for the fully sharded column // parallel LoRA A which splits the rank dim #define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \ f(in_T, out_T, W_T, 128, narrow) \ @@ -95,15 +112,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 512, narrow) \ f(in_T, out_T, W_T, 
640, narrow) \ f(in_T, out_T, W_T, 768, narrow) \ + f(in_T, out_T, W_T, 896, narrow) \ f(in_T, out_T, W_T, 1024, narrow) \ f(in_T, out_T, W_T, 1152, narrow) \ + f(in_T, out_T, W_T, 1216, narrow) \ f(in_T, out_T, W_T, 1280, narrow) \ f(in_T, out_T, W_T, 1536, narrow) \ f(in_T, out_T, W_T, 1664, narrow) \ f(in_T, out_T, W_T, 1728, narrow) \ f(in_T, out_T, W_T, 1792, narrow) \ f(in_T, out_T, W_T, 2048, narrow) \ + f(in_T, out_T, W_T, 2240, narrow) \ f(in_T, out_T, W_T, 2304, narrow) \ + f(in_T, out_T, W_T, 2368, narrow) \ + f(in_T, out_T, W_T, 2432, narrow) \ f(in_T, out_T, W_T, 2560, narrow) \ f(in_T, out_T, W_T, 2752, narrow) \ f(in_T, out_T, W_T, 2816, narrow) \ @@ -111,8 +133,12 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 3328, narrow) \ f(in_T, out_T, W_T, 3456, narrow) \ f(in_T, out_T, W_T, 3584, narrow) \ + f(in_T, out_T, W_T, 3712, narrow) \ f(in_T, out_T, W_T, 4096, narrow) \ + f(in_T, out_T, W_T, 4480, narrow) \ f(in_T, out_T, W_T, 4608, narrow) \ + f(in_T, out_T, W_T, 4736, narrow) \ + f(in_T, out_T, W_T, 4864, narrow) \ f(in_T, out_T, W_T, 5120, narrow) \ f(in_T, out_T, W_T, 5504, narrow) \ f(in_T, out_T, W_T, 5632, narrow) \ @@ -122,8 +148,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 6848, narrow) \ f(in_T, out_T, W_T, 6912, narrow) \ f(in_T, out_T, W_T, 7168, narrow) \ + f(in_T, out_T, W_T, 7424, narrow) \ f(in_T, out_T, W_T, 8192, narrow) \ + f(in_T, out_T, W_T, 8960, narrow) \ f(in_T, out_T, W_T, 9216, narrow) \ + f(in_T, out_T, W_T, 9472, narrow) \ f(in_T, out_T, W_T, 10240, narrow) \ f(in_T, out_T, W_T, 11008, narrow) \ f(in_T, out_T, W_T, 11264, narrow) \ @@ -131,8 +160,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 13696, narrow) \ f(in_T, out_T, W_T, 13824, narrow) \ f(in_T, out_T, W_T, 14336, narrow) \ + f(in_T, out_T, W_T, 14784, narrow) \ + f(in_T, out_T, W_T, 14848, narrow) \ f(in_T, out_T, W_T, 15360, 
narrow) \ f(in_T, out_T, W_T, 16384, narrow) \ + f(in_T, out_T, W_T, 18944, narrow) \ f(in_T, out_T, W_T, 20480, narrow) \ f(in_T, out_T, W_T, 22016, narrow) \ f(in_T, out_T, W_T, 22528, narrow) \ @@ -140,6 +172,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 27392, narrow) \ f(in_T, out_T, W_T, 27648, narrow) \ f(in_T, out_T, W_T, 28672, narrow) \ + f(in_T, out_T, W_T, 29568, narrow) \ + f(in_T, out_T, W_T, 29696, narrow) \ f(in_T, out_T, W_T, 32000, narrow) \ f(in_T, out_T, W_T, 32256, narrow) \ f(in_T, out_T, W_T, 32512, narrow) \ diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index dae1d5687..110c9b243 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -49,21 +49,30 @@ H1 = H2 = [ 128, 256, 512, + 896, 1024, 1152, + 1216, 1280, 1536, 1664, 2048, + 2240, 2304, + 2368, + 2432, 2560, 2752, 3072, 3328, 3456, 3584, + 3712, 4096, + 4480, 4608, + 4736, + 4864, 5120, 5504, 5632, @@ -73,19 +82,27 @@ H1 = H2 = [ 6848, 6912, 7168, + 7424, 8192, + 8960, 9216, + 9472, 10240, 11008, 11264, 13824, 14336, + 14784, + 14848, 15360, + 18944, 22016, 22528, 24576, 27392, 27648, + 29568, + 29696, 32000, 32256, 32512, -- GitLab From c35e4a3dd74fa5952b04354a3c7cfd0ed09e2eb0 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Thu, 20 Jun 2024 21:45:34 -0700 Subject: [PATCH 118/376] [BugFix] Fix test_phi3v.py (#5725) --- tests/conftest.py | 4 +++- tests/models/test_phi3v.py | 10 ++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5bbfd87f0..67885b932 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -233,11 +233,13 @@ class HfRunner: prompts: List[str], max_tokens: int, images: Optional[List[Image.Image]] = None, + **kwargs, ) -> List[Tuple[List[int], str]]: outputs = self.generate(prompts, do_sample=False, max_new_tokens=max_tokens, - images=images) + images=images, + **kwargs) return [(output_ids[0], output_str[0]) for output_ids, 
output_str in outputs] diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 1732e8f08..234547598 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -77,7 +77,7 @@ if is_cpu(): # numeric difference for longer context and test can't pass @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [8]) +@pytest.mark.parametrize("max_tokens", [128]) def test_models(hf_runner, vllm_runner, hf_images, vllm_images, model_and_config, dtype: str, max_tokens: int) -> None: """Inference result should be the same between hf and vllm. @@ -95,9 +95,11 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images, hf_model_kwargs = {"_attn_implementation": "eager"} with hf_runner(model_id, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model: - hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, - max_tokens, - images=hf_images) + hf_outputs = hf_model.generate_greedy( + HF_IMAGE_PROMPTS, + max_tokens, + images=hf_images, + eos_token_id=hf_model.processor.tokenizer.eos_token_id) vllm_image_prompts = [ p.replace("<|image_1|>", -- GitLab From 67005a07bc0991211ba2acccb3e56c72a47f9def Mon Sep 17 00:00:00 2001 From: Jee Li Date: Fri, 21 Jun 2024 12:46:28 +0800 Subject: [PATCH 119/376] [Bugfix] Add fully sharded layer for QKVParallelLinearWithLora (#5665) Co-authored-by: Antoni Baum --- tests/lora/test_baichuan.py | 14 +++++--- tests/lora/test_layers.py | 7 ++-- vllm/lora/fully_sharded_layers.py | 58 +++++++++++++++++++++++++++++-- vllm/lora/layers.py | 36 +++++++++++-------- vllm/lora/utils.py | 4 ++- 5 files changed, 93 insertions(+), 26 deletions(-) diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index e1b81655c..56cec4db8 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -64,7 +64,8 @@ def test_baichuan_lora(baichuan_lora_files): @pytest.mark.skip("Requires multiple GPUs") -def 
test_baichuan_tensor_parallel_equality(baichuan_lora_files): +@pytest.mark.parametrize("fully_sharded", [True, False]) +def test_baichuan_tensor_parallel_equality(baichuan_lora_files, fully_sharded): # Cannot use as it will initialize torch.cuda too early... # if torch.cuda.device_count() < 4: # pytest.skip(f"Not enough GPUs for tensor parallelism {4}") @@ -75,7 +76,8 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files): max_loras=4, max_lora_rank=64, tensor_parallel_size=1, - trust_remote_code=True) + trust_remote_code=True, + fully_sharded_loras=fully_sharded) output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1) del llm_tp1 @@ -87,7 +89,8 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files): max_loras=4, max_lora_rank=64, tensor_parallel_size=2, - trust_remote_code=True) + trust_remote_code=True, + fully_sharded_loras=fully_sharded) output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2) del llm_tp2 @@ -101,10 +104,11 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files): max_loras=4, max_lora_rank=64, tensor_parallel_size=4, - trust_remote_code=True) + trust_remote_code=True, + fully_sharded_loras=fully_sharded) output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2) del llm_tp4 cleanup() - assert output_tp1 == output_tp4 \ No newline at end of file + assert output_tp1 == output_tp4 diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 4b489670f..2e51e95a3 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -12,7 +12,8 @@ from vllm.config import LoRAConfig from vllm.lora.fully_sharded_layers import ( ColumnParallelLinearWithShardedLoRA, MergedColumnParallelLinearWithShardedLoRA, - MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA) + MergedQKVParallelLinearWithShardedLora, QKVParallelLinearWithShardedLora, + RowParallelLinearWithShardedLoRA) # yapf conflicts with isort for this block # yapf: disable from vllm.lora.layers import 
(BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, @@ -684,7 +685,9 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, bias=False, params_dtype=torch.float16) linear.weight.data = torch.rand_like(linear.weight.data) - lora_linear = QKVParallelLinearWithLora(linear) + lora_linear = QKVParallelLinearWithLora( + linear + ) if not fully_shard else QKVParallelLinearWithShardedLora(linear) @dataclass class FakeConfig: diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index ffdc32b73..d27171f72 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -12,6 +12,7 @@ from vllm.distributed.parallel_state import get_tensor_model_parallel_rank from vllm.lora.layers import (ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, MergedQKVParallelLinearWithLora, + QKVParallelLinearWithLora, RowParallelLinearWithLoRA) from vllm.lora.punica import bgmv, dispatch_bgmv_low_level @@ -90,11 +91,11 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA): def _mcp_apply(x, bias, layer): """ MergedColumnParallelLinearWithShardedLoRA and - QKVParallelLinearWithShardedLora share the same + MergedQKVParallelLinearWithShardedLora share the same LoRa weight application method. The main difference is the step by shard_size for lora_b which can - vary for QKVParallelLinearWithShardedLora but is constant for + vary for MergedQKVParallelLinearWithShardedLora but is constant for MergedColumnParallelLinearWithShardedLoRA. """ # expecting 2 for column parallel and 3 for qkv @@ -167,7 +168,7 @@ class MergedColumnParallelLinearWithShardedLoRA( ) -class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora): +class QKVParallelLinearWithShardedLora(QKVParallelLinearWithLora): """ Differs from QKVParallelLinearWithLora by slicing the LoRA A's also. 
@@ -175,6 +176,57 @@ class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora): Based on S-LoRA, slicing happens along the rank dim. """ + def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.lora_a_stacked.shape[2] + start_idx = tp_rank * shard_size + lora_a = lora_a[:, start_idx:start_idx + shard_size] + return lora_a + + def apply(self, x: torch.Tensor, + bias: Optional[torch.Tensor]) -> torch.Tensor: + output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + + x = x.view(-1, x.shape[-1]) + output, out_orig_shape = output.view(-1, + output.shape[-1]), output.shape + buffer = torch.zeros((x.shape[0], self.lora_a_stacked.shape[2]), + dtype=torch.float32, + device=x.device) + + bgmv(buffer, x, self.lora_a_stacked, + self.indices[:self.indices_len[0]], 0, 1.0) + buffer = tensor_model_parallel_all_gather(buffer) + bgmv(output, buffer, self.lora_b_stacked, + self.indices[:self.indices_len[0]], 0, 1.0) + # now have column partitioned output + + output = output.view(*out_orig_shape) + return output + + @classmethod + @_fully_sharded_can_replace + def can_replace_layer(cls, source_layer: nn.Module, + lora_config: LoRAConfig, packed_modules_list: List, + model_config: Optional[PretrainedConfig]) -> bool: + # specifying kwargs so they can be easily accessed in decorator + return super().can_replace_layer( + source_layer=source_layer, + lora_config=lora_config, + packed_modules_list=packed_modules_list, + model_config=model_config, + decorate=False, + ) + + +class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora): + """ + Differs from MergedQKVParallelLinearWithLora by slicing the + LoRA A's also. + + Based on S-LoRA, slicing happens along the rank dim. 
+ """ + def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e3ab1708c..e4a23273f 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -641,6 +641,24 @@ class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): self.kv_proj_total_size = (self.base_layer.total_num_kv_heads * self.base_layer.head_size) + def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: + tp_rank = get_tensor_model_parallel_rank() + self.q_shard_id = tp_rank + self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas + lora_b_q = lora_b[:, self.q_proj_shard_size * + self.q_shard_id:self.q_proj_shard_size * + (self.q_shard_id + 1)] + k_offset = self.q_proj_total_size + lora_b_k = lora_b[:, k_offset + + self.kv_proj_shard_size * self.kv_shard_id:k_offset + + self.kv_proj_shard_size * (self.kv_shard_id + 1)] + v_offset = k_offset + self.kv_proj_total_size + lora_b_v = lora_b[:, v_offset + + self.kv_proj_shard_size * self.kv_shard_id:v_offset + + self.kv_proj_shard_size * (self.kv_shard_id + 1)] + lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1) + return lora_b + def set_lora( self, index: int, @@ -650,21 +668,8 @@ class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): ): self.reset_lora(index) if self.tp_size > 1: - tp_rank = get_tensor_model_parallel_rank() - self.q_shard_id = tp_rank - self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas - lora_b_q = lora_b[:, self.q_proj_shard_size * - self.q_shard_id:self.q_proj_shard_size * - (self.q_shard_id + 1)] - k_offset = self.q_proj_total_size - lora_b_k = lora_b[:, k_offset + self.kv_proj_shard_size * - self.kv_shard_id:k_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] - v_offset = k_offset + self.kv_proj_total_size - lora_b_v = lora_b[:, v_offset + self.kv_proj_shard_size * - self.kv_shard_id:v_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] - lora_b = 
torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1) + lora_a = self.slice_lora_a(lora_a) + lora_b = self.slice_lora_b(lora_b) self.lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -674,6 +679,7 @@ class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): lora_b.T, non_blocking=True) @classmethod + @_not_fully_sharded_can_replace def can_replace_layer(cls, source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: List, model_config: Optional[PretrainedConfig]) -> bool: diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 4a86c16cf..ab3b99eee 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -8,7 +8,8 @@ from vllm.logger import init_logger from vllm.lora.fully_sharded_layers import ( ColumnParallelLinearWithShardedLoRA, MergedColumnParallelLinearWithShardedLoRA, - MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA) + MergedQKVParallelLinearWithShardedLora, QKVParallelLinearWithShardedLora, + RowParallelLinearWithShardedLoRA) # being imported for _all_lora_classes below # yapf conflicts with isort for this block # yapf: disable @@ -35,6 +36,7 @@ _all_lora_classes: Set[Type[BaseLayerWithLoRA]] = { RowParallelLinearWithLoRA, LogitsProcessorWithLoRA, ColumnParallelLinearWithShardedLoRA, + QKVParallelLinearWithShardedLora, MergedColumnParallelLinearWithShardedLoRA, MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA, -- GitLab From d9a252bc8e8a2741d8a2997032a94208fb8f29d9 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 20 Jun 2024 22:12:35 -0700 Subject: [PATCH 120/376] [Core][Distributed] add shm broadcast (#5399) Co-authored-by: Cody Yu --- .buildkite/test-pipeline.yaml | 4 +- tests/distributed/test_shm_broadcast.py | 82 ++++++ .../device_communicators/shm_broadcast.py | 259 ++++++++++++++++++ vllm/distributed/parallel_state.py | 44 ++- vllm/envs.py | 5 + 5 files changed, 384 insertions(+), 10 deletions(-) create mode 100644 tests/distributed/test_shm_broadcast.py create 
mode 100644 vllm/distributed/device_communicators/shm_broadcast.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5e92ba3c2..c337a81d4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -28,9 +28,11 @@ steps: - label: Distributed Comm Ops Test #mirror_hardwares: [amd] - command: pytest -v -s distributed/test_comm_ops.py working_dir: "/vllm-workspace/tests" num_gpus: 2 + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py - label: Distributed Tests (2 GPUs) mirror_hardwares: [amd] diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py new file mode 100644 index 000000000..d92900ffc --- /dev/null +++ b/tests/distributed/test_shm_broadcast.py @@ -0,0 +1,82 @@ +import multiprocessing +import random +import time + +import torch.distributed as dist + +from vllm.distributed.device_communicators.shm_broadcast import ( + ShmRingBuffer, ShmRingBufferIO) +from vllm.utils import update_environment_variables + + +def distributed_run(fn, world_size): + number_of_processes = world_size + processes = [] + for i in range(number_of_processes): + env = {} + env['RANK'] = str(i) + env['LOCAL_RANK'] = str(i) + env['WORLD_SIZE'] = str(number_of_processes) + env['LOCAL_WORLD_SIZE'] = str(number_of_processes) + env['MASTER_ADDR'] = 'localhost' + env['MASTER_PORT'] = '12345' + p = multiprocessing.Process(target=fn, args=(env, )) + processes.append(p) + p.start() + + for p in processes: + p.join() + + for p in processes: + assert p.exitcode == 0 + + +def worker_fn_wrapper(fn): + # `multiprocessing.Process` cannot accept environment variables directly + # so we need to pass the environment variables as arguments + # and update the environment variables in the function + def wrapped_fn(env): + update_environment_variables(env) + dist.init_process_group(backend="gloo") + fn() + + return wrapped_fn + + +@worker_fn_wrapper +def 
worker_fn(): + writer_rank = 2 + broadcaster = ShmRingBufferIO.create_from_process_group( + dist.group.WORLD, 1024, 2, writer_rank) + if dist.get_rank() == writer_rank: + time.sleep(random.random()) + broadcaster.broadcast_object(0) + time.sleep(random.random()) + broadcaster.broadcast_object({}) + time.sleep(random.random()) + broadcaster.broadcast_object([]) + else: + time.sleep(random.random()) + a = broadcaster.broadcast_object(None) + time.sleep(random.random()) + b = broadcaster.broadcast_object(None) + time.sleep(random.random()) + c = broadcaster.broadcast_object(None) + assert a == 0 + assert b == {} + assert c == [] + dist.barrier() + + +def test_shm_broadcast(): + distributed_run(worker_fn, 4) + + +def test_singe_process(): + buffer = ShmRingBuffer(1, 1024, 4) + reader = ShmRingBufferIO(buffer, reader_rank=0) + writer = ShmRingBufferIO(buffer, reader_rank=-1) + writer.enqueue([0]) + writer.enqueue([1]) + assert reader.dequeue() == [0] + assert reader.dequeue() == [1] diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py new file mode 100644 index 000000000..119befcf6 --- /dev/null +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -0,0 +1,259 @@ +import pickle +import time +from contextlib import contextmanager +from multiprocessing import shared_memory +from typing import Optional +from unittest.mock import patch + +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +import vllm.envs as envs +from vllm.logger import init_logger + +VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL + +logger = init_logger(__name__) + + +class ShmRingBuffer: + + def __init__(self, + n_reader: int, + max_chunk_bytes: int, + max_chunks: int, + name: Optional[str] = None): + """ + A shared memory ring buffer implementation for broadcast communication. 
+ Essentially, it is a queue where only one will `enqueue` and multiple + will `dequeue`. The max size of each item, together with the max number + of items that can be stored in the buffer are known in advance. + In this case, we don't need to synchronize the access to + the buffer. + + Buffer memory layout: + data metadata + | | + | (current_idx) | (current_idx) + v v + +-------------------------------+----------------------------------------+ + | chunk0 | chunk1 | ... | chunk | metadata0 | metadata1 | ... | metadata | + +-------------------------------+----------------------------------------+ + | max_chunks x max_chunk_bytes | max_chunks x (1 + n_reader) bytes | + + metadata memory layout: each byte is a flag, the first byte is the written + flag, and the rest are reader flags. The flags are set to 0 by default. + +--------------+--------------+--------------+-----+--------------+ + | written_flag | reader0_flag | reader1_flag | ... | readerN_flag | + +--------------+--------------+--------------+-----+--------------+ + + During creation, `name` is None and the buffer is created. We can pass the + created object to other processes by pickling it. The other processes will + get the name of the shared memory and open it, so that they can access the + same shared memory buffer. 
+ """# noqa + self.n_reader = n_reader + self.metadata_size = 1 + n_reader + self.max_chunk_bytes = max_chunk_bytes + self.max_chunks = max_chunks + self.total_bytes_of_buffer = (self.max_chunk_bytes + + self.metadata_size) * self.max_chunks + self.data_offset = 0 + self.metadata_offset = self.max_chunk_bytes * self.max_chunks + + if name is None: + # we are creating a buffer + self.is_creator = True + self.shared_memory = shared_memory.SharedMemory( + create=True, size=self.total_bytes_of_buffer) + # initialize the metadata section to 0 + with memoryview(self.shared_memory.buf[self.metadata_offset:] + ) as metadata_buffer: + torch.frombuffer(metadata_buffer, dtype=torch.uint8).fill_(0) + else: + # we are opening an existing buffer + self.is_creator = False + # fix to https://stackoverflow.com/q/62748654/9191338 + # Python incorrectly tracks shared memory even if it is not + # created by the process. The following patch is a workaround. + with patch("multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None): + self.shared_memory = shared_memory.SharedMemory(name=name) + assert self.shared_memory.size == self.total_bytes_of_buffer + with memoryview(self.shared_memory.buf[self.metadata_offset:] + ) as metadata_buffer: + tensor = torch.frombuffer(metadata_buffer, dtype=torch.uint8) + assert torch.all(tensor == 0) + + def __reduce__(self): + return ( + self.__class__, + (self.n_reader, self.max_chunk_bytes, self.max_chunks, + self.shared_memory.name), + ) + + def __del__(self): + self.shared_memory.close() + if self.is_creator: + self.shared_memory.unlink() + + @contextmanager + def get_data(self, current_idx: int): + start = self.data_offset + current_idx * self.max_chunk_bytes + end = start + self.max_chunk_bytes + with memoryview(self.shared_memory.buf[start:end]) as buf: + yield buf + + @contextmanager + def get_metadata(self, current_idx: int): + start = self.metadata_offset + current_idx * self.metadata_size + end = start + self.metadata_size + 
with memoryview(self.shared_memory.buf[start:end]) as buf: + yield buf + + +class ShmRingBufferIO: + + def __init__(self, buffer: ShmRingBuffer, reader_rank: int): + self.buffer = buffer + self.reader_rank = reader_rank + self._is_writer = self.reader_rank == -1 + self._is_reader = not self._is_writer + if self._is_reader: + assert 0 <= self.reader_rank < buffer.n_reader, \ + (f"Invalid reader rank {self.reader_rank} for buffer" + f" created with {buffer.n_reader} readers") + self.current_idx = 0 + + @contextmanager + def acquire_write(self): + assert self._is_writer, "Only writers can acquire write" + start_index = self.current_idx + start_time = time.time() + n_warning = 1 + while True: + with self.buffer.get_metadata(self.current_idx) as metadata_buffer: + read_count = sum(metadata_buffer[1:]) + written_flag = metadata_buffer[0] + if written_flag and read_count != self.buffer.n_reader: + # this block is written and not read by all readers + # try to write to the next block + self.current_idx = (self.current_idx + + 1) % self.buffer.max_chunks + if self.current_idx == start_index: + # no empty block found + if time.time( + ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa + logger.warning( + "No available block found in %s second. 
", + VLLM_RINGBUFFER_WARNING_INTERVAL) + n_warning += 1 + # wait for a while (0.1 us) + time.sleep(1e-7) + continue + # found a block that is either + # (1) not written + # (2) read by all readers + + # mark the block as not written + metadata_buffer[0] = 0 + # let caller write to the buffer + with self.buffer.get_data(self.current_idx) as buf: + yield buf + + # caller has written to the buffer + # mark the block as written + metadata_buffer[0] = 1 + for i in range(1, self.buffer.n_reader + 1): + # set read flag to 0, meaning it is not read yet + metadata_buffer[i] = 0 + break + + @contextmanager + def acquire_read(self): + assert self._is_reader, "Only readers can acquire read" + start_index = self.current_idx + start_time = time.time() + n_warning = 1 + while True: + with self.buffer.get_metadata(self.current_idx) as metadata_buffer: + read_flag = metadata_buffer[self.reader_rank + 1] + written_flag = metadata_buffer[0] + if not written_flag or read_flag: + # this block is either + # (1) not written + # (2) already read by this reader + # try to read the next block + self.current_idx = (self.current_idx + + 1) % self.buffer.max_chunks + if self.current_idx == start_index: + # no block found + if time.time( + ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa + logger.warning( + "No available block found in %s second. 
", + VLLM_RINGBUFFER_WARNING_INTERVAL) + n_warning += 1 + # wait for a while (0.1 us) + time.sleep(1e-7) + continue + # found a block that is not read by this reader + # let caller read from the buffer + with self.buffer.get_data(self.current_idx) as buf: + yield buf + + # caller has read from the buffer + # set the read flag + metadata_buffer[self.reader_rank + 1] = 1 + break + + def enqueue(self, obj): + assert self._is_writer, "Only writers can enqueue" + serialized_obj = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL) + if len(serialized_obj) > self.buffer.max_chunk_bytes: + raise RuntimeError( + f"{len(serialized_obj)=} larger than the allowed value " + f"{self.buffer.max_chunk_bytes}," + "Please increase the max_chunk_bytes parameter.") + with self.acquire_write() as buf: + buf[:len(serialized_obj)] = serialized_obj + + def dequeue(self): + assert self._is_reader, "Only readers can dequeue" + with self.acquire_read() as buf: + # no need to know the size of serialized object + # pickle format itself contains the size information internally + # see https://docs.python.org/3/library/pickle.html + obj = pickle.loads(buf) + return obj + + def broadcast_object(self, obj=None): + if self._is_writer: + self.enqueue(obj) + return obj + else: + return self.dequeue() + + def create_from_process_group(pg: ProcessGroup, + max_chunk_bytes, + max_chunks, + writer_rank=0) -> "ShmRingBufferIO": + group_rank = dist.get_rank(pg) + group_world_size = dist.get_world_size(pg) + ranks_inside_group = list(range(group_world_size)) + global_ranks = dist.get_process_group_ranks(pg) + n_reader = group_world_size - 1 + buffer: ShmRingBuffer + if group_rank == writer_rank: + buffer = ShmRingBuffer(n_reader, max_chunk_bytes, max_chunks) + dist.broadcast_object_list([buffer], src=global_ranks[writer_rank]) + dist.barrier(pg) + return ShmRingBufferIO(buffer, -1) + else: + recv = [None] + dist.broadcast_object_list(recv, src=global_ranks[writer_rank]) + dist.barrier(pg) + buffer = recv[0] 
# type: ignore + rest_ranks = [r for r in ranks_inside_group if r != writer_rank] + return ShmRingBufferIO(buffer, rest_ranks.index(group_rank)) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 02b0dcbcb..5188fadbb 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -98,6 +98,7 @@ class GroupCoordinator: # communicators are only created for world size > 1 pynccl_comm: Optional[Any] # PyNccl communicator ca_comm: Optional[Any] # Custom allreduce communicator + shm_broadcaster: Optional[Any] # shared memory broadcaster def __init__( self, @@ -162,6 +163,13 @@ class GroupCoordinator: else: self.ca_comm = None + from vllm.distributed.device_communicators.shm_broadcast import ( + ShmRingBufferIO) + self.shm_broadcaster: Optional[ShmRingBufferIO] = None + if self.world_size > 1 and is_in_the_same_node(self.cpu_group): + self.shm_broadcaster = ShmRingBufferIO.create_from_process_group( + self.cpu_group, 1 << 20, 6) + @property def first_rank(self): """Return the global rank of the first process in the group""" @@ -324,6 +332,30 @@ class GroupCoordinator: group=self.device_group) return input_ + def broadcast_object(self, obj: Optional[Any] = None, src: int = 0): + """Broadcast the input object. + NOTE: `src` is the local rank of the source rank. + """ + assert src < self.world_size, f"Invalid src rank ({src})" + + # Bypass the function if we are using only 1 GPU. 
+ if self.world_size == 1: + return obj + if self.shm_broadcaster is not None: + assert src == 0, "Shared memory broadcaster only supports src=0" + return self.shm_broadcaster.broadcast_object(obj) + if self.rank_in_group == src: + torch.distributed.broadcast_object_list([obj], + src=self.ranks[src], + group=self.cpu_group) + return obj + else: + recv = [None] + torch.distributed.broadcast_object_list(recv, + src=self.ranks[src], + group=self.cpu_group) + return recv[0] + def broadcast_object_list(self, obj_list: List[Any], src: int = 0, @@ -371,9 +403,7 @@ class GroupCoordinator: # `metadata_list` lives in CPU memory. # `broadcast_object_list` has serialization & deserialization, # all happening on CPU. Therefore, we can use the CPU group. - torch.distributed.broadcast_object_list([metadata_list], - src=src, - group=metadata_group) + self.broadcast_object(metadata_list, src=src) async_handles = [] for tensor in tensor_list: if tensor.numel() == 0: @@ -396,14 +426,10 @@ class GroupCoordinator: async_handle.wait() else: - recv_metadata_list = [None] - torch.distributed.broadcast_object_list(recv_metadata_list, - src=src, - group=metadata_group) - assert recv_metadata_list[0] is not None + metadata_list = self.broadcast_object(None, src=src) tensor_dict = {} async_handles = [] - for key, value in recv_metadata_list[0]: + for key, value in metadata_list: if isinstance(value, TensorMetadata): tensor = torch.empty(value.size, dtype=value.dtype, diff --git a/vllm/envs.py b/vllm/envs.py index ae2fcd082..49277e2d3 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -5,6 +5,7 @@ if TYPE_CHECKING: VLLM_HOST_IP: str = "" VLLM_PORT: Optional[int] = None VLLM_USE_MODELSCOPE: bool = False + VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60 VLLM_INSTANCE_ID: Optional[str] = None VLLM_NCCL_SO_PATH: Optional[str] = None LD_LIBRARY_PATH: Optional[str] = None @@ -114,6 +115,10 @@ environment_variables: Dict[str, Callable[[], Any]] = { "VLLM_INSTANCE_ID": lambda: 
os.environ.get("VLLM_INSTANCE_ID", None), + # Interval in seconds to log a warning message when the ring buffer is full + "VLLM_RINGBUFFER_WARNING_INTERVAL": + lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")), + # path to cudatoolkit home directory, under which should be bin, include, # and lib directories. "CUDA_HOME": -- GitLab From bd620b01fb74d5269ca6fc0fd32f66bfb205a358 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Thu, 20 Jun 2024 23:39:40 -0700 Subject: [PATCH 121/376] [Kernel][CPU] Add Quick `gelu` to CPU (#5717) --- csrc/cpu/activation.cpp | 19 +++++++++++++++++++ csrc/cpu/torch_bindings.cpp | 4 ++++ vllm/_ipex_ops.py | 3 +++ vllm/model_executor/layers/activation.py | 3 +++ 4 files changed, 29 insertions(+) diff --git a/csrc/cpu/activation.cpp b/csrc/cpu/activation.cpp index becd2ac42..039b8d5c3 100644 --- a/csrc/cpu/activation.cpp +++ b/csrc/cpu/activation.cpp @@ -59,6 +59,13 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) { return w3 * x * (ones + t); } +FORCE_INLINE vec_op::FP32Vec8 gelu_quick_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 zeros(0.0); + const vec_op::FP32Vec8 ones(1.0); + const vec_op::FP32Vec8 w1(1.702f); + return x / (ones + (zeros - w1 * x).exp()); +} + FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) { const vec_op::FP32Vec8 ones(1.0); const vec_op::FP32Vec8 w1(M_SQRT1_2); @@ -142,3 +149,15 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input) { CPU_KERNEL_GUARD_OUT(gelu_fast_impl) }); } + +void gelu_quick(torch::Tensor& out, torch::Tensor& input) { + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1); + + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_quick_impl", [&] { + CPU_KERNEL_GUARD_IN(gelu_quick_impl) + activation_kernel( + num_tokens, d, input.data_ptr(), out.data_ptr()); + CPU_KERNEL_GUARD_OUT(gelu_quick_impl) + }); +} diff --git a/csrc/cpu/torch_bindings.cpp 
b/csrc/cpu/torch_bindings.cpp index a2bf0d49a..39e8cf3ed 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -58,6 +58,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("gelu_fast(Tensor! out, Tensor input) -> ()"); ops.impl("gelu_fast", torch::kCPU, &gelu_fast); + // Quick GELU implementation. + ops.def("gelu_quick(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_quick", torch::kCPU, &gelu_quick); + // Layernorm // Apply Root Mean Square (RMS) Normalization to the input tensor. ops.def( diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 1e60e0848..99a875c9b 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -43,6 +43,9 @@ class ipex_ops: def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: out.copy_(torch.nn.functional.gelu(x)) + # TODO add implementation of gelu_quick here + # def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + def paged_attention_v1( out: torch.Tensor, query: torch.Tensor, diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 80cad15b4..5bfdba67b 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -155,6 +155,9 @@ class QuickGELU(CustomOp): ops.gelu_quick(out, x) return out + # TODO implement forward_xpu for QuickGELU + # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + class ScaledActivation(nn.Module): """An activation function with post-scale parameters. 
-- GitLab From 5b15bde5399cbcb1052bfb49584f81ed300cd4ac Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 21 Jun 2024 12:44:29 -0400 Subject: [PATCH 122/376] [Doc] Documentation on supported hardware for quantization methods (#5745) --- docs/source/index.rst | 1 + docs/source/quantization/fp8.rst | 4 ++- .../quantization/supported_hardware.rst | 30 +++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 docs/source/quantization/supported_hardware.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 8795a865c..05133eb6d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -100,6 +100,7 @@ Documentation :maxdepth: 1 :caption: Quantization + quantization/supported_hardware quantization/auto_awq quantization/fp8 quantization/fp8_e5m2_kvcache diff --git a/docs/source/quantization/fp8.rst b/docs/source/quantization/fp8.rst index 312a56459..09f313664 100644 --- a/docs/source/quantization/fp8.rst +++ b/docs/source/quantization/fp8.rst @@ -3,7 +3,9 @@ FP8 ================== -vLLM supports FP8 (8-bit floating point) computation using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. Currently, only Hopper and Ada Lovelace GPUs are supported. Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. +vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. +Currently, only Hopper and Ada Lovelace GPUs are supported. +Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. Please visit the HF collection of `quantized FP8 checkpoints of popular LLMs ready to use with vLLM `_. 
diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.rst new file mode 100644 index 000000000..df445e00a --- /dev/null +++ b/docs/source/quantization/supported_hardware.rst @@ -0,0 +1,30 @@ +.. _supported_hardware_for_quantization: + +Supported Hardware for Quantization Kernels +=========================================== + +The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: + +============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +Implementation Volta Turing Ampere Ada Hopper AMD GPU Intel GPU x86 CPU AWS Inferentia Google TPU +============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +AQLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +AWQ ❌ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +DeepSpeedFP ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +FP8 ❌ ❌ ❌ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +Marlin ❌ ❌ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +GPTQ ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +SqueezeLLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +bitsandbytes ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== + +Notes: +^^^^^^ + +- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. +- "✅" indicates that the quantization method is supported on the specified hardware. +- "❌" indicates that the quantization method is not supported on the specified hardware. + +Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. + +For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. 
\ No newline at end of file -- GitLab From f1e72cc19a21928400b63743d5fe164ec8ed30e8 Mon Sep 17 00:00:00 2001 From: zhyncs Date: Sat, 22 Jun 2024 03:15:48 +0800 Subject: [PATCH 123/376] [BugFix] exclude version 1.15.0 for modelscope (#5668) --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 5b3e682a8..d031d98c5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -172,7 +172,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer modelscope + pip install accelerate hf_transfer 'modelscope!=1.15.0' ENV VLLM_USAGE_SOURCE production-docker-image -- GitLab From 7187507301aa8361407e04be42d0d50680891493 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 21 Jun 2024 14:04:26 -0700 Subject: [PATCH 124/376] [ci][test] fix ca test in main (#5746) --- .buildkite/test-pipeline.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c337a81d4..0b87e6280 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -197,6 +197,9 @@ steps: gpu: a100 num_gpus: 4 commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py -- GitLab From f5dda63eb5fcb5624b93fa5f09da01d5372bbce4 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Fri, 21 Jun 2024 15:42:46 -0700 Subject: [PATCH 125/376] [LoRA] Add support for pinning lora adapters in the LRU cache (#5603) --- tests/lora/test_lora_manager.py | 64 +++++++++++++++++++++++ vllm/engine/llm_engine.py | 3 ++ vllm/executor/cpu_executor.py | 3 ++ 
vllm/executor/distributed_gpu_executor.py | 7 +++ vllm/executor/executor_base.py | 4 ++ vllm/executor/gpu_executor.py | 4 ++ vllm/executor/neuron_executor.py | 3 ++ vllm/lora/models.py | 26 +++++++++ vllm/lora/worker_manager.py | 3 ++ vllm/utils.py | 43 +++++++++++++-- vllm/worker/model_runner.py | 5 ++ vllm/worker/worker.py | 3 ++ vllm/worker/worker_base.py | 8 +++ 13 files changed, 171 insertions(+), 5 deletions(-) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 51a56b121..2133bce14 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -209,6 +209,34 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model): assert manager.activate_lora(3) assert manager.lora_index_to_id[0] == 2 assert manager.lora_index_to_id[1] == 3 + assert manager.pin_lora(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 3 + assert manager.activate_lora(1) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.deactivate_lora(2) + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 1 + assert manager.activate_lora(3) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + assert manager.pin_lora(3) + assert manager.pin_lora(1) + with pytest.raises(RuntimeError): + assert manager.pin_lora(2) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + with pytest.raises(RuntimeError): + assert manager.activate_lora(2) + + assert manager.deactivate_lora(3) + assert manager.pin_lora(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.remove_lora(3) + with pytest.raises(ValueError): + assert manager.pin_lora(3) def test_lru_lora_model_manager(dist_init, dummy_model): @@ -288,6 +316,42 @@ def test_lru_lora_model_manager(dist_init, dummy_model): assert set(manager.list_loras()) == set() assert all(x is None 
def pin_lora(self, lora_id: int) -> bool:
    """Pin the LoRA adapter ``lora_id`` via the model executor.

    The request is forwarded unchanged to ``self.model_executor.pin_lora``
    and that call's result is returned as-is.
    """
    executor = self.model_executor
    return executor.pin_lora(lora_id)
def pin_lora(self, lora_id: int) -> bool:
    """Pin the LoRA adapter ``lora_id`` through ``_run_workers``.

    Dispatches the ``"pin_lora"`` method name with ``lora_id`` passed as a
    keyword argument and returns whatever ``_run_workers`` returns.
    """
    assert lora_id > 0, "lora_id must be greater than 0."
    outcome = self._run_workers("pin_lora", lora_id=lora_id)
    return outcome
def pin_lora(self, lora_id: int) -> bool:
    """Pin a LoRAModel in the manager cache.

    This manager does not implement pinning; the method exists so that
    callers get an explicit error pointing at the manager that does.

    Raises:
        NotImplementedError: always.
    """
    # The trailing space keeps the two sentences separated once the adjacent
    # string literals are concatenated.
    raise NotImplementedError(
        "Pinning is not supported in LoRAModelManager. "
        "Use LRUCacheLoRAModelManager for pinning")  # type: ignore
" + f"LoRA {lora_id} is not registered.") from err + + def _pin_lora_in_gpu_cache(self, lora_id: int): + if lora_id not in self._active_loras: + # move lora to gpu if not already active + self.activate_lora(lora_id) + + self._active_loras.pin(lora_id) + def create_lora_manager( model: nn.Module, diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 498b2b9dd..ca4903c23 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -221,6 +221,9 @@ class WorkerLoRAManager(AbstractWorkerLoRAManager): def remove_lora(self, lora_id: int) -> bool: return self._lora_manager.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + return self._lora_manager.pin_lora(lora_id) + def remove_all_loras(self): self._lora_manager.remove_all_loras() diff --git a/vllm/utils.py b/vllm/utils.py index 27a7b1042..ce5c377ef 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -15,7 +15,7 @@ from collections import defaultdict from functools import lru_cache, partial, wraps from platform import uname from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic, - Hashable, List, Optional, OrderedDict, Tuple, TypeVar, + Hashable, List, Optional, OrderedDict, Set, Tuple, TypeVar, Union) import numpy as np @@ -44,6 +44,13 @@ K = TypeVar("K") T = TypeVar("T") +class _Sentinel: + ... + + +ALL_PINNED_SENTINEL = _Sentinel() + + class Device(enum.Enum): GPU = enum.auto() CPU = enum.auto() @@ -67,6 +74,7 @@ class LRUCache(Generic[T]): def __init__(self, capacity: int): self.cache: OrderedDict[Hashable, T] = OrderedDict() + self.pinned_items: Set[Hashable] = set() self.capacity = capacity def __contains__(self, key: Hashable) -> bool: @@ -102,14 +110,36 @@ class LRUCache(Generic[T]): self.cache.move_to_end(key) self._remove_old_if_needed() + def pin(self, key: Hashable) -> None: + """ + Pins a key in the cache preventing it from being + evicted in the LRU order. 
def remove_oldest(self, remove_pinned=False):
    """Evict one entry from the front of ``self.cache``.

    With ``remove_pinned`` false, the first key that is not in
    ``self.pinned_items`` is evicted; if every key is pinned a
    ``RuntimeError`` is raised. With ``remove_pinned`` true, the first key
    is evicted regardless of pinning. An empty cache is a no-op.
    Eviction goes through ``self.pop``.
    """
    if not self.cache:
        return

    if remove_pinned:
        victim = next(iter(self.cache))
    else:
        # first key in cache order that is not pinned
        unpinned = (k for k in self.cache if k not in self.pinned_items)
        victim = next(unpinned, ALL_PINNED_SENTINEL)
        if victim is ALL_PINNED_SENTINEL:
            raise RuntimeError("All items are pinned, "
                               "cannot remove oldest from the cache.")
    self.pop(victim)
def pin_lora(self, lora_id: int) -> bool:
    """Reject the request: this worker type does not support LoRA.

    Raises:
        ValueError: always, naming the concrete worker type.
    """
    # Raise (not return) the error so callers cannot mistake the exception
    # object for a truthy "pinned" result.
    raise ValueError(f"{type(self)} does not support LoRA")
"Intel GPU Test" + depends_on: ~ + agents: + queue: intel-gpu + command: bash .buildkite/run-xpu-test.sh + {% for step in steps %} {% if step.gpu == "a100" %} - label: "{{ step.label }}" diff --git a/README.md b/README.md index c24768bf7..3e0da945d 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ vLLM is flexible and easy to use with: - Tensor parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD GPUs, and Intel CPUs +- Support NVIDIA GPUs, AMD GPUs, Intel CPUs and GPUs - (Experimental) Prefix caching support - (Experimental) Multi-lora support -- GitLab From 9c62db07ed8ee28d9f1a0e6ac215446d49532008 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Sat, 22 Jun 2024 10:07:08 +0800 Subject: [PATCH 127/376] [Model] Support Qwen-VL and Qwen-VL-Chat models with text-only inputs (#5710) Co-authored-by: Roger Wang --- vllm/model_executor/models/qwen.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index d22ea6b79..b6ea6ab39 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -28,6 +28,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput +from vllm.utils import print_warning_once class QWenMLP(nn.Module): @@ -288,6 +289,15 @@ class QWenLMHeadModel(nn.Module): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Skip loading visual weights to support Qwen-VL models + # in cases with text-only inputs + # TODO: add support for Qwen-VL + if (name not in params_dict + and name.startswith("transformer.visual.")): + print_warning_once( + "Only text inputs are allowed. 
def pin_lora(self, lora_id: int) -> bool:
    """Reject the request; ``lora_id`` is not used.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("LoRA is not implemented for TPU backend.")
Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 21 Jun 2024 23:09:40 -0700 Subject: [PATCH 130/376] [Docs][TPU] Add installation tip for TPU (#5761) --- .../getting_started/tpu-installation.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index 3627600e1..e96aabbb6 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -73,3 +73,21 @@ Next, build vLLM from source. This will only take a few seconds: .. code-block:: console $ VLLM_TARGET_DEVICE="tpu" python setup.py develop + + +.. tip:: + + If you encounter the following error: + + .. code-block:: console + + from torch._C import * # noqa: F403 + ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory + + + You can install OpenBLAS with the following command: + + .. code-block:: console + + $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev + -- GitLab From 832ea88fcb4819037b685fb47b3a0de37f2804d3 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 22 Jun 2024 10:00:43 -0700 Subject: [PATCH 131/376] [core][distributed] improve shared memory broadcast (#5754) --- .../device_communicators/shm_broadcast.py | 42 ++++++++++++++----- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 119befcf6..c44bd2f11 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -48,6 +48,26 @@ class ShmRingBuffer: | written_flag | reader0_flag | reader1_flag | ... 
| readerN_flag | +--------------+--------------+--------------+-----+--------------+ + The state of metadata is as follows: + + (case 1) 0???...???: the block is not written yet, cannot read, can write + (case 2) 1000...000: the block is just written, can read, cannot write + (case 3) 1???...???: the block is written and read by some readers, can read if not read, cannot write + (case 4) 1111...111: the block is written and read by all readers, cannot read, can write + + State transition for readers: + + When a reader finds a block that it can read (case 2 or 3), it can yield the block for caller to read. + Only after the caller finishes reading the block, the reader can mark the block as read. + Readers only mark the block as read (from 0 to 1), the writer marks the block as ready to read (from 1 to 0). + + State transition for writer: + + When the writer writes to a block (case 1 or 4), it first resets the written flag to 0, converting either case + to case 1. Then it can yield the block for caller to write. After the caller finishes writing the block, the writer + can reset the reader flags to 0, and mark the block as written (from 0 to 1). + NOTE: the order is important here, first reset the reader flags (so that we are still in case 1), then mark the block as written. The state transition is atomic. If we do it in the reverse order, it will go through case 3 and then back to case 2, and readers might read the intermediate case 3, which is not correct. + During creation, `name` is None and the buffer is created. We can pass the created object to other processes by pickling it. 
The other processes will get the name of the shared memory and open it, so that they can access the @@ -81,10 +101,6 @@ class ShmRingBuffer: lambda *args, **kwargs: None): self.shared_memory = shared_memory.SharedMemory(name=name) assert self.shared_memory.size == self.total_bytes_of_buffer - with memoryview(self.shared_memory.buf[self.metadata_offset:] - ) as metadata_buffer: - tensor = torch.frombuffer(metadata_buffer, dtype=torch.uint8) - assert torch.all(tensor == 0) def __reduce__(self): return ( @@ -163,11 +179,15 @@ class ShmRingBufferIO: yield buf # caller has written to the buffer - # mark the block as written - metadata_buffer[0] = 1 + # NOTE: order is important here + # first set the read flags to 0 + # then set the written flag to 1 + # otherwise, the readers may think they already read the block for i in range(1, self.buffer.n_reader + 1): # set read flag to 0, meaning it is not read yet metadata_buffer[i] = 0 + # mark the block as written + metadata_buffer[0] = 1 break @contextmanager @@ -247,13 +267,15 @@ class ShmRingBufferIO: buffer: ShmRingBuffer if group_rank == writer_rank: buffer = ShmRingBuffer(n_reader, max_chunk_bytes, max_chunks) - dist.broadcast_object_list([buffer], src=global_ranks[writer_rank]) - dist.barrier(pg) + dist.broadcast_object_list([buffer], + src=global_ranks[writer_rank], + group=pg) return ShmRingBufferIO(buffer, -1) else: recv = [None] - dist.broadcast_object_list(recv, src=global_ranks[writer_rank]) - dist.barrier(pg) + dist.broadcast_object_list(recv, + src=global_ranks[writer_rank], + group=pg) buffer = recv[0] # type: ignore rest_ranks = [r for r in ranks_inside_group if r != writer_rank] return ShmRingBufferIO(buffer, rest_ranks.index(group_rank)) -- GitLab From 6c916ac8a80d1b2f4e0d0113a67767dc254a3598 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Mon, 24 Jun 2024 02:37:11 +0530 Subject: [PATCH 132/376] [BugFix] [Kernel] Add Cutlass2x fallback kernels (#5744) Co-authored-by: Varun Sundar Rabindranath 
// Query the `cudaDevAttrMaxSharedMemoryPerBlockOptin` attribute of `device`
// and return it.
//
// The status returned by cudaDeviceGetAttribute is discarded; the local is
// zero-initialized, so 0 is returned if the call does not write to it.
inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
  int max_shared_mem_per_block_opt_in = 0;
  cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         device);
  return max_shared_mem_per_block_opt_in;
}
+ static const int max_shared_mem_per_block_opt_in = + get_cuda_max_shared_memory_per_block_opt_in(0); + + size_t const gemm_shared_mem_size = + sizeof(typename Gemm::KernelType::SharedStorage); + size_t const fallback_gemm_shared_mem_size = + sizeof(typename FallbackGemm::KernelType::SharedStorage); + + if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) { + return cutlass_gemm_caller(out, a, b, + std::forward(args)...); + } else { + TORCH_CHECK(fallback_gemm_shared_mem_size <= + max_shared_mem_per_block_opt_in); + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + template typename Epilogue> struct sm80_config_default { // This config is used in 2 cases, // - M in (128, inf) // - M in (64, 128] and N >= 8192 + // Shared Memory required by this Gemm - 81920 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; @@ -271,6 +298,7 @@ struct sm80_config_M64 { // This config is used in 2 cases, // - M in (32, 64] // - M in (64, 128] and N < 8192 + // Shared Memory required by this Gemm - 122880 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>; using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; @@ -284,6 +312,7 @@ template typename Epilogue> struct sm80_config_M32 { // M in (16, 32] + // Shared Memory required by this Gemm - 61440 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>; using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>; @@ -297,6 +326,7 @@ template typename Epilogue> struct sm80_config_M16 { // M in [1, 16] + // Shared Memory required by this Gemm - 51200 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>; using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>; @@ -331,35 +361,45 @@ void 
cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a, using Cutlass2xGemmM16 = typename sm80_config_M16::Cutlass2xGemm; + // Due to shared memory requirements, some Gemms may fail to run on some + // GPUs. As the name indicates, the Fallback Gemm is used as an alternative + // in such cases. + // sm80_config_M16 has the least shared-memory requirement. However, + // based on some profiling, we select sm80_config_M32 as a better alternative + // performance wise. + using FallbackGemm = + typename sm80_config_M32::Cutlass2xGemm; + uint32_t const m = a.size(0); uint32_t const mp2 = std::max(static_cast(16), next_pow_2(m)); // next power of 2 if (mp2 <= 16) { // M in [1, 16] - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else if (mp2 <= 32) { // M in (16, 32] - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else if (mp2 <= 64) { // M in (32, 64] - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else if (mp2 <= 128) { // M in (64, 128] uint32_t const n = out.size(1); bool const small_n = n < 8192; if (small_n) { - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else { - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } } else { // M in (128, inf) - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } } -- GitLab From 5d4d90536fa24c032bb91ae629b7b4958e045b03 Mon Sep 17 00:00:00 2001 From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com> Date: Sun, 23 Jun 2024 17:42:28 -0400 Subject: [PATCH 133/376] [Distributed] Add send and recv helpers (#5719) --- tests/distributed/test_comm_ops.py | 78 +++++++- tests/distributed/test_custom_all_reduce.py | 5 +- tests/distributed/test_pynccl.py | 16 +- tests/utils.py | 2 +- 
.../device_communicators/pynccl.py | 14 +- vllm/distributed/parallel_state.py | 187 ++++++++++++++++++ 6 files changed, 278 insertions(+), 24 deletions(-) diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index 53654dc40..bf0f31df0 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -8,12 +8,11 @@ import pytest import ray import torch -from vllm.distributed import (broadcast_tensor_dict, +from vllm.distributed import (broadcast_tensor_dict, get_pp_group, tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) -from ..utils import (init_test_distributed_environment, - multi_process_tensor_parallel) +from ..utils import init_test_distributed_environment, multi_process_parallel @ray.remote(num_gpus=1, max_calls=1) @@ -105,6 +104,68 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, assert torch.allclose(recv_dict["f"], test_dict["f"]) +@ray.remote(num_gpus=1, max_calls=1) +def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, + distributed_init_port: str): + del os.environ["CUDA_VISIBLE_DEVICES"] + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(tp_size, pp_size, rank, + distributed_init_port) + + test_dict = { + # device tensor + "a": torch.arange(8, dtype=torch.float32, device="cuda"), + # CPU tensor + "b": torch.arange(16, dtype=torch.int8, device="cpu"), + "c": "test", + "d": [1, 2, 3], + "e": { + "a": 1, + "b": 2 + }, + # empty tensor + "f": torch.tensor([], dtype=torch.float32, device="cuda"), + } + + if not get_pp_group().is_first_rank: + recv_dict = get_pp_group().recv_tensor_dict() + + if not get_pp_group().is_last_rank: + get_pp_group().send_tensor_dict(test_dict) + + if not get_pp_group().is_first_rank: + assert len(recv_dict) == len(test_dict) + assert torch.allclose(recv_dict["a"], test_dict["a"]) + assert torch.allclose(recv_dict["b"], test_dict["b"]) + assert 
recv_dict["c"] == test_dict["c"] + assert recv_dict["d"] == test_dict["d"] + assert recv_dict["e"] == test_dict["e"] + assert torch.allclose(recv_dict["f"], test_dict["f"]) + + +@ray.remote(num_gpus=1, max_calls=1) +def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, + distributed_init_port: str): + del os.environ["CUDA_VISIBLE_DEVICES"] + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(tp_size, pp_size, rank, + distributed_init_port) + + size = 64 + test_tensor = torch.arange(64, dtype=torch.float32, device="cuda") + + if not get_pp_group().is_first_rank: + recv_tensor = get_pp_group().recv(size, dtype=torch.float32) + + if not get_pp_group().is_last_rank: + get_pp_group().send(test_tensor) + + if not get_pp_group().is_first_rank: + assert torch.allclose(test_tensor, recv_tensor) + + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test.") @pytest.mark.parametrize("tp_size", [2]) @@ -113,4 +174,13 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, broadcast_tensor_dict_test_worker ]) def test_multi_process_tensor_parallel(tp_size, test_target): - multi_process_tensor_parallel(tp_size, 1, test_target) + multi_process_parallel(tp_size, 1, test_target) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize("pp_size", [2]) +@pytest.mark.parametrize( + "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]) +def test_multi_process_pipeline_parallel(pp_size, test_target): + multi_process_parallel(1, pp_size, test_target) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 9a39160b8..3c281a45f 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -12,8 +12,7 @@ from vllm.distributed.parallel_state import 
(get_tensor_model_parallel_group, get_tp_group, graph_capture) from ..utils import (ensure_model_parallel_initialized, - init_test_distributed_environment, - multi_process_tensor_parallel) + init_test_distributed_environment, multi_process_parallel) random.seed(42) test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)] @@ -113,4 +112,4 @@ def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target): world_size = tp_size * pipeline_parallel_size if world_size > torch.cuda.device_count(): pytest.skip("Not enough GPUs to run the test.") - multi_process_tensor_parallel(tp_size, pipeline_parallel_size, test_target) + multi_process_parallel(tp_size, pipeline_parallel_size, test_target) diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 964dbc542..e0e424439 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -168,9 +168,13 @@ def send_recv_worker_fn(): dtype=torch.float32).cuda(pynccl_comm.rank) with pynccl_comm.change_state(enable=True): if pynccl_comm.rank == 0: - pynccl_comm.send(tensor) + pynccl_comm.send(tensor, + dst=(pynccl_comm.rank + 1) % + pynccl_comm.world_size) else: - pynccl_comm.recv(tensor) + pynccl_comm.recv(tensor, + src=(pynccl_comm.rank - 1) % + pynccl_comm.world_size) result = tensor.mean().cpu().item() assert result == 1 @@ -203,9 +207,13 @@ def multiple_send_recv_worker_fn(): device=device) with pynccl_comm.change_state(enable=True): if torch.distributed.get_rank() in [0, 1]: - pynccl_comm.send(tensor) + pynccl_comm.send(tensor, + dst=(pynccl_comm.rank + 1) % + pynccl_comm.world_size) else: - pynccl_comm.recv(tensor) + pynccl_comm.recv(tensor, + src=(pynccl_comm.rank - 1) % + pynccl_comm.world_size) result = tensor.mean().cpu().item() if torch.distributed.get_rank() in [0, 2]: assert result == 1 diff --git a/tests/utils.py b/tests/utils.py index bc30515c8..174efca4a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -129,7 +129,7 @@ def 
init_test_distributed_environment( ensure_model_parallel_initialized(tp_size, pp_size) -def multi_process_tensor_parallel( +def multi_process_parallel( tp_size: int, pp_size: int, test_target, diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 83eec264b..731956654 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -121,10 +121,7 @@ class PyNcclCommunicator: ncclRedOpTypeEnum.from_torch(op), self.comm, cudaStream_t(stream.cuda_stream)) - def send(self, - tensor: torch.Tensor, - dst: Optional[int] = None, - stream=None): + def send(self, tensor: torch.Tensor, dst: int, stream=None): if self.disabled: return assert tensor.device == self.device, ( @@ -132,16 +129,11 @@ class PyNcclCommunicator: f"but the input tensor is on {tensor.device}") if stream is None: stream = self.stream - if dst is None: - dst = (self.rank + 1) % self.world_size self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), dst, self.comm, cudaStream_t(stream.cuda_stream)) - def recv(self, - tensor: torch.Tensor, - src: Optional[int] = None, - stream=None): + def recv(self, tensor: torch.Tensor, src: int, stream=None): if self.disabled: return assert tensor.device == self.device, ( @@ -149,8 +141,6 @@ class PyNcclCommunicator: f"but the input tensor is on {tensor.device}") if stream is None: stream = self.stream - if src is None: - src = (self.rank - 1) % self.world_size self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, self.comm, cudaStream_t(stream.cuda_stream)) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 5188fadbb..5f1decb37 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -20,6 +20,7 @@ If you only need to use the distributed environment without model/pipeline 
steps. """ import contextlib +import pickle from collections import namedtuple from contextlib import contextmanager, nullcontext from dataclasses import dataclass @@ -28,6 +29,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union from unittest.mock import patch import torch +import torch.distributed from torch.distributed import Backend, ProcessGroup import vllm.envs as envs @@ -180,6 +182,16 @@ class GroupCoordinator: """Return the global rank of the last process in the group""" return self.ranks[-1] + @property + def is_first_rank(self): + """Return whether the caller is the first process in the group""" + return self.rank == self.first_rank + + @property + def is_last_rank(self): + """Return whether the caller is the last process in the group""" + return self.rank == self.last_rank + @property def next_rank(self): """Return the global rank of the process that follows the caller""" @@ -374,6 +386,70 @@ class GroupCoordinator: group=self.device_group) return obj_list + def send_object(self, obj: Any, dst: int) -> None: + """Send the input object list to the destination rank.""" + """NOTE: `dst` is the local rank of the destination rank.""" + + assert dst < self.world_size, f"Invalid dst rank ({dst})" + + assert dst != self.rank, ( + "Invalid destination rank. 
Destination rank is the same " + "as the current rank.") + + # Serialize object to tensor and get the size as well + object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8) + + size_tensor = torch.tensor([object_tensor.numel()], + dtype=torch.long, + device="cpu") + + # Send object size + + torch.distributed.send(size_tensor, + dst=self.ranks[dst], + group=self.cpu_group) + + # Send object + torch.distributed.send(object_tensor, + dst=self.ranks[dst], + group=self.cpu_group) + + return None + + def recv_object(self, src: int) -> Any: + """Receive the input object list from the source rank.""" + """NOTE: `src` is the local rank of the source rank.""" + + assert src < self.world_size, f"Invalid src rank ({src})" + + assert src != self.rank, ( + "Invalid source rank. Source rank is the same as the current rank." + ) + + size_tensor = torch.empty(1, dtype=torch.long, device="cpu") + + # Receive object size + rank_size = torch.distributed.recv(size_tensor, + src=src, + group=self.cpu_group) + + # Tensor to receive serialized objects into. + object_tensor = torch.empty( # type: ignore[call-overload] + size_tensor.item(), # type: ignore[arg-type] + dtype=torch.uint8, + device="cpu") + + rank_object = torch.distributed.recv(object_tensor, + src=src, + group=self.cpu_group) + + assert rank_object == rank_size, ( + "Received object sender rank does not match the size sender rank.") + + obj = pickle.loads(object_tensor.numpy().tobytes()) + + return obj + def broadcast_tensor_dict( self, tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, @@ -459,6 +535,88 @@ class GroupCoordinator: async_handle.wait() return tensor_dict + def send_tensor_dict( + self, + tensor_dict: Dict[Any, Union[torch.Tensor, Any]], + dst: Optional[int] = None + ) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]: + """Send the input tensor dictionary. + NOTE: `dst` is the local rank of the destination rank. + """ + # Bypass the function if we are using only 1 GPU.
+ if not torch.distributed.is_initialized() or self.world_size == 1: + return tensor_dict + + group = self.device_group + metadata_group = self.cpu_group + + if dst is None: + dst = self.next_rank + assert dst < self.world_size, f"Invalid dst rank ({dst})" + + metadata_list: List[Tuple[Any, Any]] = [] + assert isinstance( + tensor_dict, + dict), f"Expecting a dictionary, got {type(tensor_dict)}" + metadata_list, tensor_list = _split_tensor_dict(tensor_dict) + # `metadata_list` lives in CPU memory. + # `send_object` has serialization & deserialization, + # all happening on CPU. Therefore, we can use the CPU group. + self.send_object(metadata_list, dst=dst) + for tensor in tensor_list: + if tensor.numel() == 0: + # Skip sending empty tensors. + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + torch.distributed.send(tensor, dst=dst, group=metadata_group) + else: + # use group for GPU tensors + torch.distributed.send(tensor, dst=dst, group=group) + return None + + def recv_tensor_dict( + self, + src: Optional[int] = None + ) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]: + """Recv the input tensor dictionary. + NOTE: `src` is the local rank of the source rank. + """ + # Bypass the function if we are using only 1 GPU. + if not torch.distributed.is_initialized() or self.world_size == 1: + return None + + group = self.device_group + metadata_group = self.cpu_group + + if src is None: + src = self.prev_rank + assert src < self.world_size, f"Invalid src rank ({src})" + + recv_metadata_list = self.recv_object(src=src) + tensor_dict = {} + for key, value in recv_metadata_list: + if isinstance(value, TensorMetadata): + tensor = torch.empty(value.size, + dtype=value.dtype, + device=value.device) + if tensor.numel() == 0: + # Skip receiving empty tensors.
+ tensor_dict[key] = tensor + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + torch.distributed.recv(tensor, + src=src, + group=metadata_group) + else: + # use group for GPU tensors + torch.distributed.recv(tensor, src=src, group=group) + tensor_dict[key] = tensor + else: + tensor_dict[key] = value + return tensor_dict + def barrier(self): """Barrier synchronization among the group. NOTE: don't use `device_group` here! `barrier` in NCCL is @@ -468,6 +626,35 @@ class GroupCoordinator: """ torch.distributed.barrier(group=self.cpu_group) + def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: + """Sends a tensor to the destination rank in a non-blocking way""" + """NOTE: `dst` is the local rank of the destination rank.""" + if dst is None: + dst = self.next_rank + + pynccl_comm = self.pynccl_comm + if pynccl_comm is not None and not pynccl_comm.disabled: + pynccl_comm.send(tensor, dst) + else: + torch.distributed.send(tensor, self.ranks[dst], self.device_group) + + def recv(self, + size: torch.Size, + dtype: torch.dtype, + src: Optional[int] = None) -> torch.Tensor: + """Receives a tensor from the src rank.""" + """NOTE: `src` is the local rank of the source rank.""" + if src is None: + src = self.prev_rank + + tensor = torch.empty(size, dtype=dtype, device=self.device) + pynccl_comm = self.pynccl_comm + if pynccl_comm is not None and not pynccl_comm.disabled: + pynccl_comm.recv(tensor, src) + else: + torch.distributed.recv(tensor, self.ranks[src], self.device_group) + return tensor + def destroy(self): if self.device_group is not None: torch.distributed.destroy_process_group(self.device_group) -- GitLab From edd5fe5fa29b8f9cc5fa37a30cc7211e0ff37067 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 24 Jun 2024 12:11:53 +0800 Subject: [PATCH 134/376] [Bugfix] Add phi3v resize for dynamic shape and fix torchvision requirement (#5772) --- requirements-cpu.txt | 1 + requirements-cuda.txt | 2 +
requirements-test.txt | 1 - tests/models/test_phi3v.py | 4 ++ vllm/model_executor/models/phi3v.py | 69 +++++++++++++++++++++++++++-- 5 files changed, 72 insertions(+), 5 deletions(-) diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 8b7d86e68..21acee91d 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -3,4 +3,5 @@ # Dependencies for x86_64 CPUs torch == 2.3.1+cpu +torchvision == 0.18.1+cpu # required for the image processor of phi3v, this must be updated alongside torch triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. \ No newline at end of file diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 353617983..10596ed85 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -5,5 +5,7 @@ ray >= 2.9 nvidia-ml-py # for pynvml package torch == 2.3.0 +# These must be updated alongside torch +torchvision == 0.18.0 # Required for phi3v processor, also see https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version xformers == 0.0.26.post1 # Requires PyTorch 2.3.0 vllm-flash-attn == 2.5.9 # Requires PyTorch 2.3.0 diff --git a/requirements-test.txt b/requirements-test.txt index fef0ede7b..8b68e0e93 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -14,7 +14,6 @@ peft requests ray sentence-transformers # required for embedding -torchvision # required for the image processor of phi3v # Benchmarking aiohttp diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 234547598..a29d50df4 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -22,6 +22,7 @@ assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) def iter_phi3v_configs(model_name: str): image_hw_to_feature_size = { (1008, 1344): 1921, + (2016, 2688): 1933, } for (h, w), f in image_hw_to_feature_size.items(): @@ -75,6 +76,9 @@ if is_cpu(): # TODO: Add test for `tensor_parallel_size` [ref: PR #3883] # Since we use _attn_implementation="eager" for hf_runner, here is # 
numeric difference for longer context and test can't pass +@pytest.mark.xfail( + reason="Inconsistent image processor being used due to lack " + "of support for dynamic image token replacement") @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [128]) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index fa20a7c59..dac832a68 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -13,14 +13,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Iterable, List, Literal, Optional, Tuple, TypedDict +from typing import Dict, Iterable, List, Literal, Optional, Tuple, TypedDict +import numpy as np import torch import torch.nn as nn +from PIL import Image from transformers import CLIPVisionConfig, PretrainedConfig from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, VisionLanguageConfig +from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig +from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -32,9 +35,11 @@ from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.models.vlm_base import VisionLanguageModelBase from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.image import get_dummy_image_data +from vllm.multimodal.image import ImagePixelData, get_dummy_image_data from vllm.sequence import SamplerOutput +logger = init_logger(__name__) + _KEYS_TO_MODIFY_MAPPING = { "model.vision_embed_tokens": "vision_embed_tokens", } @@ -268,7 +273,63 @@ class 
Phi3VImagePixelInputs(TypedDict): """Shape: (batch_size, 2)""" -@MULTIMODAL_REGISTRY.register_image_pixel_input() +# FIXME(Isotr0py): Remove these after dynamic num_img_tokens is supported +# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py +def calc_padded_size(width, height, padding_unit=336): + target_height = int(np.ceil(height / padding_unit) * padding_unit) + top_padding = int((target_height - height) / 2) + bottom_padding = target_height - height - top_padding + padded_width = width + padded_height = height + top_padding + bottom_padding + return padded_width, padded_height + + +# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py +def calc_hd_transform_size(width, height, hd_num=16): + transposed = False + if width < height: + width, height = height, width + transposed = True + + ratio = width / height + scale = 1 + while scale * np.ceil(scale / ratio) <= hd_num: + scale += 1 + scale -= 1 + + new_width = int(scale * 336) + new_height = int(new_width / ratio) + + padded_width, padded_height = calc_padded_size(new_width, new_height) + + if transposed: + padded_width, padded_height = padded_height, padded_width + + return padded_width, padded_height + + +def _image_processor( + data: ImagePixelData, + model_config: ModelConfig, + vlm_config: VisionLanguageConfig, +) -> Dict[str, torch.Tensor]: + image = data.image + + if isinstance(image, Image.Image): + # Temporary patch before dynamic number of image tokens is supported + _, _, h, w = vlm_config.image_input_shape + if (w, h) != calc_hd_transform_size(image.width, image.height): + logger.warning( + "Dynamic image shape is currently not supported. 
" + "Resizing input image to (%d, %d).", w, h) + + data.image = image.resize((w, h)) + + return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \ + ._default_input_processor(data, model_config, vlm_config) + + +@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_processor) @MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data) class Phi3VForCausalLM(VisionLanguageModelBase): -- GitLab From c2462129521a64b62ace77b28641d2e3bec5831c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 24 Jun 2024 00:37:42 -0700 Subject: [PATCH 135/376] [doc][faq] add warning to download models for every nodes (#5783) --- docs/source/serving/distributed_serving.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst index b0c45dbf7..2a7937a91 100644 --- a/docs/source/serving/distributed_serving.rst +++ b/docs/source/serving/distributed_serving.rst @@ -35,4 +35,7 @@ To scale vLLM beyond a single machine, install and start a `Ray runtime -After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines. \ No newline at end of file +After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines. + +.. warning:: + Please make sure you downloaded the model to all the nodes, or the model is downloaded to some distributed file system that is accessible by all nodes. 
-- GitLab From e72dc6cb3507d914eec8dfd0d5c7b9478f6a8ccc Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 24 Jun 2024 13:26:17 -0400 Subject: [PATCH 136/376] [Doc] Add "Suggest edit" button to doc pages (#5789) --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index ca26dcec4..af1f22b23 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -66,6 +66,7 @@ html_theme_options = { 'path_to_docs': 'docs/source', 'repository_url': 'https://github.com/vllm-project/vllm', 'use_repository_button': True, + 'use_edit_page_button': True, } # Add any paths that contain custom static files (such as style sheets) here, -- GitLab From 1744cc99ba9bdefea8f3f798cf51ed650b81a98e Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 24 Jun 2024 13:48:55 -0400 Subject: [PATCH 137/376] [Doc] Add Phi-3-medium to list of supported models (#5788) --- docs/source/models/supported_models.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index f4673dc27..47737ae52 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -129,7 +129,7 @@ Alongside each architecture, we include some popular models that use it. - ✅︎ * - :code:`Phi3ForCausalLM` - Phi-3 - - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, etc. + - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, :code:`microsoft/Phi-3-medium-128k-instruct`, etc. 
- * - :code:`Phi3SmallForCausalLM` - Phi-3-Small -- GitLab From ba991d5c84adbc0685075af88333c688ddb06011 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Mon, 24 Jun 2024 16:01:19 -0700 Subject: [PATCH 138/376] [Bugfix] Fix FlexibleArgumentParser replaces _ with - for actual args (#5795) --- vllm/utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index ce5c377ef..f0c7df5cf 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -822,7 +822,13 @@ class FlexibleArgumentParser(argparse.ArgumentParser): processed_args = [] for arg in args: if arg.startswith('--'): - processed_args.append('--' + arg[len('--'):].replace('_', '-')) + if '=' in arg: + key, value = arg.split('=', 1) + key = '--' + key[len('--'):].replace('_', '-') + processed_args.append(f'{key}={value}') + else: + processed_args.append('--' + + arg[len('--'):].replace('_', '-')) else: processed_args.append(arg) -- GitLab From e9de9dd551ac595a9f3825fcd1507deceef4f332 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 24 Jun 2024 21:09:02 -0700 Subject: [PATCH 139/376] [ci] Remove aws template (#5757) Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 7 +- .buildkite/test-template-aws.j2 | 145 -------------------------------- 2 files changed, 5 insertions(+), 147 deletions(-) delete mode 100644 .buildkite/test-template-aws.j2 diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0b87e6280..19b1bce16 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1,7 +1,10 @@ # In this file, you can add more tests to run either by adding a new step or # adding a new command to an existing step. See different options here for examples. -# This script will be feed into Jinja template in `test-template-aws.j2` to generate -# the final pipeline yaml file. 
+ +# This script will be fed into the Jinja template in `test-template-aws.j2` at +# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 +# to generate the final pipeline yaml file. + steps: - label: Regression Test diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 deleted file mode 100644 index 1a7fb44c2..000000000 --- a/.buildkite/test-template-aws.j2 +++ /dev/null @@ -1,145 +0,0 @@ -{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} -{% set default_working_dir = "/vllm-workspace/tests" %} - -steps: - - label: ":docker: build image" - agents: - queue: cpu_queue - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ." - - "docker push {{ docker_image }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - - wait - - - group: "AMD Tests" - depends_on: ~ - steps: - {% for step in steps %} - {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} - - label: "AMD: {{ step.label }}" - agents: - queue: amd - command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" - env: - DOCKER_BUILDKIT: "1" - priority: 100 - soft_fail: true - {% endif %} - {% endfor %} - - - label: "Neuron Test" - depends_on: ~ - agents: - queue: neuron - command: bash .buildkite/run-neuron-test.sh - soft_fail: false - - - label: "Intel CPU Test" - depends_on: ~ - agents: - queue: intel-cpu - command: bash .buildkite/run-cpu-test.sh - - - label: "Intel GPU Test" - depends_on: ~ - agents: - queue: intel-gpu - command: bash .buildkite/run-xpu-test.sh - - {% for step in steps %} - {% if
step.gpu == "a100" %} - - label: "{{ step.label }}" - agents: - queue: a100-queue - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: {{ docker_image }} - command: ["bash"] - args: - - '-c' - - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" - resources: - limits: - nvidia.com/gpu: {{ step.num_gpus or 1 }} - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - {% else %} - - label: "{{ step.label }}" - agents: - {% if step.label == "Documentation Build" %} - queue: small_cpu_queue - {% elif step.no_gpu %} - queue: cpu_queue - {% elif step.num_gpus == 2 or step.num_gpus == 4 %} - queue: gpu_4_queue - {% else %} - queue: gpu_1_queue - {% endif %} - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - docker#v5.2.0: - image: {{ docker_image }} - always-pull: true - propagate-environment: true - {% if not step.no_gpu %} - gpus: all - {% endif %} - {% if step.label == "Benchmarks" %} - mount-buildkite-agent: true - {% endif %} - command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"] - environment: - - VLLM_USAGE_SOURCE=ci-test - - HF_TOKEN - {% if step.label == "Speculative 
decoding tests" %} - - VLLM_ATTENTION_BACKEND=XFORMERS - {% endif %} - volumes: - - /dev/shm:/dev/shm - {% endif %} - {% endfor %} -- GitLab From f23871e9eead900d6146961ca894f5bc91f30f5e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 25 Jun 2024 16:25:03 +0800 Subject: [PATCH 140/376] [Doc] Add notice about breaking changes to VLMs (#5818) --- docs/source/models/vlm.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 70ac82e20..de55a1a09 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -5,6 +5,9 @@ Using VLMs vLLM provides experimental support for Vision Language Models (VLMs). This document shows you how to run and serve these models using vLLM. +.. important:: + We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation. + Engine Arguments ---------------- @@ -39,6 +42,10 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` image_feature_size=576, ) +.. important:: + We will remove most of the vision-specific arguments in a future release as they can be inferred from the HuggingFace configuration. + + To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`: * ``prompt``: The prompt should have a number of ```` tokens equal to ``image_feature_size``. @@ -63,6 +70,9 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS A code example can be found in `examples/llava_example.py `_. +.. important:: + We will remove the need to format image tokens in a future release. Afterwards, the input text will follow the same format as that for the original HuggingFace model. 
+ Online OpenAI Vision API Compatible Inference ---------------------------------------------- @@ -89,6 +99,9 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with --image-feature-size 576 \ --chat-template template_llava.jinja +.. important:: + We will remove most of the vision-specific arguments in a future release as they can be inferred from the HuggingFace configuration. + To consume the server, you can use the OpenAI client like in the example below: .. code-block:: python -- GitLab From 2ce5d6688bae64e467640b05e73af2888e93afcf Mon Sep 17 00:00:00 2001 From: Woo-Yeon Lee Date: Tue, 25 Jun 2024 18:56:06 +0900 Subject: [PATCH 141/376] [Speculative Decoding] Support draft model on different tensor-parallel size than target model (#5414) --- .buildkite/test-pipeline.yaml | 3 +- benchmarks/benchmark_latency.py | 6 + .../e2e/test_integration_dist_tp2.py | 111 +++++++++++++ ...n_dist.py => test_integration_dist_tp4.py} | 41 +++-- vllm/config.py | 24 ++- vllm/distributed/parallel_state.py | 76 ++++++--- vllm/engine/arg_utils.py | 10 ++ vllm/spec_decode/multi_step_worker.py | 11 +- vllm/spec_decode/proposer_worker_base.py | 4 +- .../spec_decode/smaller_tp_proposer_worker.py | 149 ++++++++++++++++++ vllm/spec_decode/spec_decode_worker.py | 12 +- 11 files changed, 388 insertions(+), 59 deletions(-) create mode 100644 tests/spec_decode/e2e/test_integration_dist_tp2.py rename tests/spec_decode/e2e/{test_integration_dist.py => test_integration_dist_tp4.py} (62%) create mode 100644 vllm/spec_decode/smaller_tp_proposer_worker.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 19b1bce16..10cfe35d8 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -54,7 +54,7 @@ steps: - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s 
distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - pytest -v -s spec_decode/e2e/test_integration_dist.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py @@ -71,6 +71,7 @@ steps: # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - label: Engine Test mirror_hardwares: [amd] diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index a4cf0632b..f3d00e456 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -25,6 +25,8 @@ def main(args: argparse.Namespace): model=args.model, speculative_model=args.speculative_model, num_speculative_tokens=args.num_speculative_tokens, + speculative_draft_tensor_parallel_size=\ + args.speculative_draft_tensor_parallel_size, tokenizer=args.tokenizer, quantization=args.quantization, tensor_parallel_size=args.tensor_parallel_size, @@ -127,6 +129,10 @@ if __name__ == '__main__': parser.add_argument('--model', type=str, default='facebook/opt-125m') parser.add_argument('--speculative-model', type=str, default=None) parser.add_argument('--num-speculative-tokens', type=int, default=None) + parser.add_argument('--speculative-draft-tensor-parallel-size', + '-spec-draft-tp', + type=int, + default=None) parser.add_argument('--tokenizer', type=str, default=None) parser.add_argument('--quantization', '-q', diff --git 
a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py new file mode 100644 index 000000000..5534b80c0 --- /dev/null +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -0,0 +1,111 @@ +"""Tests which cover integration of the speculative decoding framework with +tensor parallelism. +""" + +import pytest +import torch + +from vllm.utils import is_hip + +from .conftest import run_greedy_equality_correctness_test + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + "tensor_parallel_size": 2, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. + "use_async": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 3, + }, + { + "speculative_model": "[ngram]", + "num_speculative_tokens": 5, + "ngram_prompt_lookup_max": 3, + }, +]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify greedy equality when tensor parallelism is used. 
+ """ + if is_hip(): + pytest.skip("hip is not well-supported yet") + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. + # Note this is repeated in the test body; to initialize a tokenizer. + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + "tensor_parallel_size": 2, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. + "use_async": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "speculative_draft_tensor_parallel_size": 1, + }, +]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("seed", [1]) +def test_draft_model_tp_lt_target_model_tp2(test_llm_generator, + baseline_llm_generator, + batch_size: int): + """Verify spec decode works well with smaller tp for draft models. 
+ """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=32, + force_output_len=True) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py similarity index 62% rename from tests/spec_decode/e2e/test_integration_dist.py rename to tests/spec_decode/e2e/test_integration_dist_tp4.py index d444ef24c..56cb0147d 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -5,16 +5,16 @@ tensor parallelism. import pytest import torch -from vllm.utils import is_hip - from .conftest import run_greedy_equality_correctness_test -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@pytest.mark.skipif(torch.cuda.device_count() < 4, + reason="Need at least 4 GPUs to run the test.") @pytest.mark.parametrize( "common_llm_kwargs", [{ + # Use a small model for a fast test. + # Note this is repeated in the test body; to initialize a tokenizer. "model": "JackFram/llama-68m", # Skip cuda graph recording for fast test. @@ -22,7 +22,7 @@ from .conftest import run_greedy_equality_correctness_test # Required for spec decode. "use_v2_block_manager": True, - "tensor_parallel_size": 2, + "tensor_parallel_size": 4, # Use AsyncLLM engine, so that the engine runs in its own process. # Otherwise, since vLLM does not follow true SPMD, the test runner @@ -31,35 +31,30 @@ from .conftest import run_greedy_equality_correctness_test # second run of the test to fail with internal NCCL error. 
"use_async": True, }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ { "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - }, - { - "speculative_model": "[ngram]", "num_speculative_tokens": 5, - "ngram_prompt_lookup_max": 3, }, ]) -@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize( - "output_len", + "test_llm_kwargs", [ - # Use smaller output len for fast test. - 32, + #TODO(wooyeon): add spec_draft_dp=2 case + { + "speculative_draft_tensor_parallel_size": 1, + }, ]) +@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) -def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, - batch_size: int, output_len: int): - """Verify greedy equality when tensor parallelism is used. +def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, + baseline_llm_generator, + batch_size: int): + """Verify spec decode works well with smaller tp for draft models. """ - if is_hip(): - pytest.skip("hip is not well-supported yet") run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, - max_output_len=output_len, + max_output_len=32, force_output_len=True) diff --git a/vllm/config.py b/vllm/config.py index 8d004902f..0217a2b56 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -797,6 +797,7 @@ class SpeculativeConfig: target_parallel_config: ParallelConfig, target_dtype: str, speculative_model: Optional[str], + speculative_draft_tensor_parallel_size: Optional[int], num_speculative_tokens: Optional[int], speculative_max_model_len: Optional[int], enable_chunked_prefill: bool, @@ -819,6 +820,8 @@ class SpeculativeConfig: target_dtype (str): The data type used for the target model. 
speculative_model (Optional[str]): The name of the speculative model, if provided. + speculative_draft_tensor_parallel_size (Optional[int]): The degree + of the tensor parallelism for the draft model. num_speculative_tokens (Optional[int]): The number of speculative tokens, if provided. Will default to the number in the draft model config if present, otherwise is required. @@ -939,7 +942,8 @@ class SpeculativeConfig: draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( - target_parallel_config)) + target_parallel_config, + speculative_draft_tensor_parallel_size)) if num_speculative_tokens is None: raise ValueError( @@ -993,16 +997,26 @@ class SpeculativeConfig: @staticmethod def create_draft_parallel_config( - target_parallel_config: ParallelConfig) -> ParallelConfig: + target_parallel_config: ParallelConfig, + speculative_draft_tensor_parallel_size: Optional[int] + ) -> ParallelConfig: """Create a parallel config for use by the draft worker. - This is mostly a copy of the target parallel config. In the future the - draft worker can have a different parallel strategy, e.g. TP=1. + This is mostly a copy of the target parallel config, except the tp_size. """ + if speculative_draft_tensor_parallel_size is None: + speculative_draft_tensor_parallel_size = \ + target_parallel_config.tensor_parallel_size + elif speculative_draft_tensor_parallel_size != 1: + # TODO(wooyeon): allow tp values larger than 1 + raise ValueError( + f"{speculative_draft_tensor_parallel_size=} cannot be" + f"other value than 1") + draft_parallel_config = ParallelConfig( pipeline_parallel_size=target_parallel_config. pipeline_parallel_size, - tensor_parallel_size=target_parallel_config.tensor_parallel_size, + tensor_parallel_size=speculative_draft_tensor_parallel_size, distributed_executor_backend=target_parallel_config. distributed_executor_backend, max_parallel_loading_workers=target_parallel_config. 
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 5f1decb37..a7a806b05 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -676,6 +676,28 @@ def get_world_group() -> GroupCoordinator: return _WORLD +def init_world_group(ranks: List[int], local_rank: int, + backend: str) -> GroupCoordinator: + return GroupCoordinator( + group_ranks=[ranks], + local_rank=local_rank, + torch_distributed_backend=backend, + use_pynccl=False, + use_custom_allreduce=False, + ) + + +def init_model_parallel_group(group_ranks: List[List[int]], local_rank: int, + backend: str) -> GroupCoordinator: + return GroupCoordinator( + group_ranks=group_ranks, + local_rank=local_rank, + torch_distributed_backend=backend, + use_pynccl=True, + use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, + ) + + _TP: Optional[GroupCoordinator] = None @@ -764,13 +786,7 @@ def init_distributed_environment( global _WORLD if _WORLD is None: ranks = list(range(torch.distributed.get_world_size())) - _WORLD = GroupCoordinator( - group_ranks=[ranks], - local_rank=local_rank, - torch_distributed_backend=backend, - use_pynccl=False, - use_custom_allreduce=False, - ) + _WORLD = init_world_group(ranks, local_rank, backend) else: assert _WORLD.world_size == torch.distributed.get_world_size(), ( "world group already initialized with a different world size") @@ -827,13 +843,8 @@ def initialize_model_parallel( range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)) group_ranks.append(ranks) - _TP = GroupCoordinator( - group_ranks=group_ranks, - local_rank=get_world_group().local_rank, - torch_distributed_backend=backend, - use_pynccl=True, - use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, - ) + _TP = init_model_parallel_group(group_ranks, + get_world_group().local_rank, backend) # Build the pipeline model-parallel groups. 
num_pipeline_model_parallel_groups: int = (world_size // @@ -845,13 +856,8 @@ def initialize_model_parallel( for i in range(num_pipeline_model_parallel_groups): ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) group_ranks.append(ranks) - _PP = GroupCoordinator( - group_ranks=group_ranks, - local_rank=get_world_group().local_rank, - torch_distributed_backend=backend, - use_pynccl=True, - use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, - ) + _PP = init_model_parallel_group(group_ranks, + get_world_group().local_rank, backend) def ensure_model_parallel_initialized( @@ -887,6 +893,34 @@ def model_parallel_is_initialized(): return (_TP is not None and _PP is not None) +_TP_STATE_PATCHED = False + + +@contextmanager +def patch_tensor_parallel_group(tp_group: GroupCoordinator): + """Patch the tp group temporarily until this function ends. + + This method is for draft workers of speculative decoding to run draft model + with different tp degree from that of target model workers. + + Args: + tp_group (GroupCoordinator): the tp group coordinator + """ + global _TP_STATE_PATCHED + assert not _TP_STATE_PATCHED, "Should not call when it's already patched" + + _TP_STATE_PATCHED = True + old_tp_group = get_tp_group() + global _TP + _TP = tp_group + try: + yield + finally: + # restore the original state + _TP_STATE_PATCHED = False + _TP = old_tp_group + + def get_tensor_model_parallel_world_size(): """Return world size for the tensor model parallel group.""" return get_tp_group().world_size diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ef3161242..16374098b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -94,6 +94,7 @@ class EngineArgs: guided_decoding_backend: str = 'outlines' # Speculative decoding configuration. 
speculative_model: Optional[str] = None + speculative_draft_tensor_parallel_size: Optional[int] = None num_speculative_tokens: Optional[int] = None speculative_max_model_len: Optional[int] = None speculative_disable_by_batch_size: Optional[int] = None @@ -537,6 +538,13 @@ class EngineArgs: default=EngineArgs.num_speculative_tokens, help='The number of speculative tokens to sample from ' 'the draft model in speculative decoding.') + parser.add_argument( + '--speculative-draft-tensor-parallel-size', + '-spec-draft-tp', + type=int, + default=EngineArgs.speculative_draft_tensor_parallel_size, + help='Number of tensor parallel replicas for ' + 'the draft model in speculative decoding.') parser.add_argument( '--speculative-max-model-len', @@ -686,6 +694,8 @@ class EngineArgs: target_parallel_config=parallel_config, target_dtype=self.dtype, speculative_model=self.speculative_model, + speculative_draft_tensor_parallel_size = \ + self.speculative_draft_tensor_parallel_size, num_speculative_tokens=self.num_speculative_tokens, speculative_disable_by_batch_size=self. speculative_disable_by_batch_size, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 668ceefe6..e469fd7c3 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -6,7 +6,8 @@ import torch from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData, SequenceGroupMetadata) -from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.spec_decode.interfaces import (SpeculativeProposals, + SpeculativeProposer) from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker @@ -28,9 +29,9 @@ class MultiStepWorker(Worker, ProposerWorkerBase): super().__init__(*args, **kwargs) # Lazy initialization list. 
- self._proposer: Top1Proposer + self._proposer: SpeculativeProposer - def init_device(self): + def init_device(self) -> None: super().init_device() self._proposer = Top1Proposer( @@ -40,7 +41,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase): max_proposal_len=self.max_model_len, ) - def set_include_gpu_probs_tensor(self): + def set_include_gpu_probs_tensor(self) -> None: # Need include_gpu_probs_tensor for multi_step_worker self.model_runner.model.sampler.include_gpu_probs_tensor = True @@ -73,7 +74,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase): # Run model sample_len times. model_outputs: List[SamplerOutput] = [] for _ in range(sample_len): - model_output = super().execute_model( + model_output: List[SamplerOutput] = super().execute_model( execute_model_req=copied_execute_model_req) assert (len(model_output) == 1 ), "composing multistep workers not supported" diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py index fd67ceb91..b691659fb 100644 --- a/vllm/spec_decode/proposer_worker_base.py +++ b/vllm/spec_decode/proposer_worker_base.py @@ -3,10 +3,10 @@ from typing import List, Optional, Tuple from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.spec_decode.interfaces import SpeculativeProposer -from vllm.worker.worker_base import WorkerBase +from vllm.worker.worker_base import LoraNotSupportedWorkerBase -class ProposerWorkerBase(WorkerBase, SpeculativeProposer): +class ProposerWorkerBase(LoraNotSupportedWorkerBase, SpeculativeProposer): """Interface for proposer workers""" @abstractmethod diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py new file mode 100644 index 000000000..b78e44895 --- /dev/null +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -0,0 +1,149 @@ +from typing import List, Optional, Tuple + +import torch + +from vllm.distributed.parallel_state import (get_tp_group, + init_model_parallel_group, + 
patch_tensor_parallel_group) +from vllm.logger import init_logger +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase + +logger = init_logger(__name__) + + +class SmallerTpProposerWorker(ProposerWorkerBase): + """Class which allows a speculative draft model to run with smaller tensor + parallel degree than target model. + This reduces the communication overhead of small draft models. + + To implement this feature, this class differs behavior based on is_dummy + flag, where dummy means worker that does not participate draft generation. + Participating workers use a smaller tp group by patching vLLM's tensor + parallel group temporarily during forward passes of draft models. + """ + + @classmethod + def maybe_wrap_worker(cls, worker, draft_tensor_parallel_size: int, + target_tensor_parallel_size: int): + """Wrap the worker in a SmallerTpProposerWorker if necessary. + """ + if draft_tensor_parallel_size == target_tensor_parallel_size: + return worker + + # gpu ranks that will generate draft tokens together + draft_ranks = list(range(draft_tensor_parallel_size)) + + logger.info("Wrapping {%s} in {%s}", type(worker), cls) + return cls(worker, draft_ranks) + + def __init__(self, worker: MultiStepWorker, draft_ranks: List[int]): + """Create a SmallerTpProposerWorker. + + Args: + worker (MultiStepWorker): an actual worker wrapped with this class + draft_ranks (List[int]): if this value is given, only the GPU ranks + written in this value participate in draft generation + """ + self._worker = worker + self._draft_ranks = draft_ranks + + # init during init_device + self._is_dummy = False + self._tp_group = None + + def _patch_tensor_parallel_group(self): + """Temporarily patch the global tp group state with its own tp group + state. 
+ """ + return patch_tensor_parallel_group(self._tp_group) + + def init_device(self) -> None: + self._is_dummy = get_tp_group().rank not in self._draft_ranks + + # dummy workers do nothing + if self._is_dummy: + return + + # creates tp process group containing only a subset of gpu ranks + local_rank = get_tp_group().local_rank + tp_backend = torch.distributed.get_backend(get_tp_group().device_group) + self._tp_group = init_model_parallel_group([self._draft_ranks], + local_rank, tp_backend) + + with self._patch_tensor_parallel_group(): + self._worker.init_device() + + def set_include_gpu_probs_tensor(self) -> None: + if self._is_dummy: + return + + # Need include_gpu_probs_tensor for multi_step_worker + self._worker.set_include_gpu_probs_tensor() + + def load_model(self) -> None: + if self._is_dummy: + return + + with self._patch_tensor_parallel_group(): + self._worker.load_model() + + def determine_num_available_blocks(self) -> Tuple[int, int]: + if self._is_dummy: + # this case is not used now + return -1, -1 + + with self._patch_tensor_parallel_group(): + return self._worker.determine_num_available_blocks() + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + if self._is_dummy: + return + + with self._patch_tensor_parallel_group(): + self._worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + + def sampler_output( + self, + execute_model_req: ExecuteModelRequest, + sample_len: int, + ) -> Tuple[List[SamplerOutput], bool]: + # Do not check _is_dummy, as it's always called by get_spec_proposals + return self._worker.sampler_output(execute_model_req, sample_len) + + def get_spec_proposals( + self, + execute_model_req: ExecuteModelRequest, + ) -> SpeculativeProposals: + """Produce speculations given an input batch of sequences. The number of + speculative tokens per sequence is determined by max_proposal_len. 
+ """ + if self._is_dummy: + return SpeculativeProposals(None, None, None) + + with self._patch_tensor_parallel_group(): + return self._worker.get_spec_proposals(execute_model_req) + + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + if self._is_dummy: + return [] + + with self._patch_tensor_parallel_group(): + return self._worker.execute_model(execute_model_req) + + def get_cache_block_size_bytes(self) -> int: + if self._is_dummy: + # by returning zero, target worker can use the entire kv cache space + return 0 + + return self._worker.get_cache_block_size_bytes() + + @property + def vocab_size(self) -> int: + return self._worker.vocab_size diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 58d3461a2..5089e3dd5 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional, Tuple import torch -from vllm.config import SpeculativeConfig +from vllm.config import ParallelConfig, SpeculativeConfig from vllm.distributed.communication_op import broadcast_tensor_dict from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler @@ -18,6 +18,7 @@ from vllm.spec_decode.mlp_speculator_worker import MLPSpeculatorWorker from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase +from vllm.spec_decode.smaller_tp_proposer_worker import SmallerTpProposerWorker from vllm.spec_decode.util import (create_sequence_group_output, get_all_num_logprobs, get_sampled_token_logprobs, nvtx_range, @@ -90,7 +91,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): @classmethod def create_worker( cls, - scorer_worker: WorkerBase, + scorer_worker: Worker, draft_worker_kwargs: Dict[str, Any], disable_by_batch_size: 
Optional[int], ) -> "SpecDecodeWorker": @@ -111,7 +112,14 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs) disable_bonus_tokens = False else: + draft_parallel_config: ParallelConfig = draft_worker_kwargs[ + 'parallel_config'] + draft_tp = draft_parallel_config.tensor_parallel_size + target_tp = scorer_worker.parallel_config.tensor_parallel_size + proposer_worker = MultiStepWorker(**draft_worker_kwargs) + proposer_worker = SmallerTpProposerWorker.maybe_wrap_worker( + proposer_worker, draft_tp, target_tp) logger.info("Configuring SpecDecodeWorker with proposer=%s", type(proposer_worker)) -- GitLab From 7b993143014c95844b380a5b05eebd14ad77b7aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Wed, 26 Jun 2024 00:41:36 +0800 Subject: [PATCH 142/376] [Misc] Remove useless code in cpu_worker (#5824) --- vllm/worker/cpu_worker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 3ee394f99..914df0c7d 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -277,7 +277,6 @@ class CPUWorker(LoraNotSupportedWorkerBase): assert seq_group_metadata_list is not None num_seq_groups: int = len(seq_group_metadata_list) assert execute_model_req is not None - blocks_to_copy = execute_model_req.blocks_to_copy blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, device="cpu", dtype=torch.int64).view(-1, 2) -- GitLab From 67882dbb44186d781ab6db9eaec08f6616dc86bd Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 25 Jun 2024 10:15:10 -0700 Subject: [PATCH 143/376] [Core] Add fault tolerance for `RayTokenizerGroupPool` (#5748) --- tests/tokenization/test_tokenizer_group.py | 99 ++++++++++++++++ vllm/engine/async_llm_engine.py | 2 + vllm/engine/llm_engine.py | 2 + .../tokenizer_group/base_tokenizer_group.py | 4 + .../tokenizer_group/ray_tokenizer_group.py | 112 ++++++++++++++---- 5 files changed, 195 
insertions(+), 24 deletions(-) diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py index 31571dbff..1b9a59075 100644 --- a/tests/tokenization/test_tokenizer_group.py +++ b/tests/tokenization/test_tokenizer_group.py @@ -1,5 +1,7 @@ import asyncio import os +import sys +from typing import List, Optional from unittest.mock import patch import pytest @@ -100,3 +102,100 @@ async def test_tokenizer_group_ray_pool_env_var_propagation( max_num_seqs=1, max_input_length=None) tokenizer_pool.ping() + + +@pytest.mark.asyncio +@pytest.mark.parametrize("tokenizer_group_type", ["ray"]) +async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type): + """Test that Ray tokenizer pool group can recover from failures and + if that's not possible, mark itself as unhealthy.""" + + class FailingTokenizerGroup(TokenizerGroup): + + def __init__(self, + *args, + fail_at: Optional[List[int]] = None, + **kwargs): + super().__init__(*args, **kwargs) + self.i = 0 + self.fail_at = fail_at or [] + + def encode(self, *args, **kwargs): + self.i += 1 + if self.i in self.fail_at: + sys.exit(1) + return super().encode(*args, **kwargs) + + class FailingRayTokenizerGroupPool(RayTokenizerGroupPool): + _worker_cls = FailingTokenizerGroup + + # Fail at first iteration + fail_at = [1] + tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type) + tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config( + tokenizer_pool_config, + tokenizer_id="gpt2", + enable_lora=False, + max_num_seqs=1, + max_input_length=None, + fail_at=fail_at) + tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy() + + # Modify fail at to not fail at all (will be re-read when actor is + # re-initialized). + fail_at[0] = 1000 + + # We should recover successfully. 
+ await tokenizer_group_pool.encode_async(request_id="1", + prompt="prompt", + lora_request=None) + await tokenizer_group_pool.encode_async(request_id="1", + prompt="prompt", + lora_request=None) + + # Check that we have a new actor + assert len(tokenizer_group_pool.tokenizer_actors) == len(tokenizer_actors) + assert tokenizer_group_pool.tokenizer_actors != tokenizer_actors + + # Fail at first iteration + fail_at = [1] + tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config( + tokenizer_pool_config, + tokenizer_id="gpt2", + enable_lora=False, + max_num_seqs=1, + max_input_length=None, + fail_at=fail_at) + + # We should fail after re-initialization. + with pytest.raises(RuntimeError): + await tokenizer_group_pool.encode_async(request_id="1", + prompt="prompt", + lora_request=None) + + # check_health should raise the same thing + with pytest.raises(RuntimeError): + tokenizer_group_pool.check_health() + + # Ensure that non-ActorDiedErrors are still propagated correctly and do not + # cause a re-initialization. + fail_at = [] + tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config( + tokenizer_pool_config, + tokenizer_id="gpt2", + enable_lora=False, + max_num_seqs=1, + max_input_length=2, + fail_at=fail_at) + tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy() + + # Prompt too long error + with pytest.raises(ValueError): + await tokenizer_group_pool.encode_async(request_id="1", + prompt="prompt" * 100, + lora_request=None) + await tokenizer_group_pool.encode_async(request_id="1", + prompt="prompt", + lora_request=None) + # Actors should stay the same. 
+ assert tokenizer_group_pool.tokenizer_actors == tokenizer_actors diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index df25eb111..7994b873f 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -310,6 +310,8 @@ class _AsyncLLMEngine(LLMEngine): ) async def check_health_async(self) -> None: + if self.tokenizer: + self.tokenizer.check_health() self.model_executor.check_health() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f7eae257f..0ad957ef9 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1013,6 +1013,8 @@ class LLMEngine: return self.model_executor.pin_lora(lora_id) def check_health(self) -> None: + if self.tokenizer: + self.tokenizer.check_health() self.model_executor.check_health() def is_tracing_enabled(self) -> bool: diff --git a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py index 3cce96e06..18fbd894f 100644 --- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py @@ -53,3 +53,7 @@ class BaseTokenizerGroup(ABC): ) -> "PreTrainedTokenizer": """Get a tokenizer for a LoRA request.""" pass + + def check_health(self): + """Raise exception if the tokenizer group is unhealthy.""" + return diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py index 7c6054168..21ec2b52b 100644 --- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py @@ -2,17 +2,21 @@ import asyncio import os from typing import List, Optional +from ray.exceptions import ActorDiedError from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy from transformers import PreTrainedTokenizer from vllm.config import TokenizerPoolConfig from 
vllm.executor.ray_utils import ray +from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( BaseTokenizerGroup) from vllm.transformers_utils.tokenizer_group.tokenizer_group import ( TokenizerGroup) +logger = init_logger(__name__) + class RayTokenizerGroupPool(BaseTokenizerGroup): """A Ray-based pool of TokenizerGroups for async tokenization.""" @@ -46,24 +50,28 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): ray_actor_options: dict, **tokenizer_config): # Store a local copy of the TokenizerGroup for quick access # to underlying HF tokenizers. + self._tokenizer_config = { + "tokenizer_id": tokenizer_id, + "enable_lora": enable_lora, + "max_num_seqs": max_num_seqs, + "max_input_length": max_input_length, + **tokenizer_config + } self._local_tokenizer_group = self._worker_cls( - tokenizer_id=tokenizer_id, - enable_lora=enable_lora, - max_num_seqs=max_num_seqs, - max_input_length=max_input_length, - **tokenizer_config, - ) - - ray_tokenizer_group_cls = ray.remote( + **self._tokenizer_config, ) + + self._ray_tokenizer_group_cls = ray.remote( self._worker_cls).options(**ray_actor_options) - self.tokenizer_actors = [ - ray_tokenizer_group_cls.remote(tokenizer_id, enable_lora, - max_num_seqs, max_input_length, - **tokenizer_config) - for _ in range(num_actors) - ] + self.tokenizer_actors = [self._init_actor() for _ in range(num_actors)] self._idle_actors: Optional[asyncio.Queue] = None + # If set, actor is unhealthy. Will reraise on the next + # check_health call. 
+ self._exception: Optional[ActorDiedError] = None + + def _init_actor(self) -> ray.ObjectRef: + return self._ray_tokenizer_group_cls.remote(**self._tokenizer_config) + @property def pool_size(self) -> int: return len(self.tokenizer_actors) @@ -78,6 +86,22 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): for actor in self.tokenizer_actors: self._idle_actors.put_nowait(actor) + def _finalize_encode(self, actor: ray.ObjectRef, + original_actor: ray.ObjectRef, actor_is_alive: bool): + assert self._idle_actors is not None + # Cleanup the dead actor. + if not actor_is_alive or original_actor is not actor: + self.tokenizer_actors.remove(original_actor) + if actor_is_alive: + # Put the actor back in the queue. + # This is done in a finally block to ensure that the actor is + # always put back in the queue, even if an exception/cancellation + # is raised. + self._idle_actors.put_nowait(actor) + # Add back the new actor. + if original_actor is not actor: + self.tokenizer_actors.append(actor) + def encode(self, prompt: str, request_id: Optional[str] = None, @@ -88,23 +112,41 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): The actor is then put back in the queue for future use. This is blocking. """ + self.check_health() self._ensure_queue_initialized() assert self._idle_actors is not None if self._idle_actors.empty(): raise RuntimeError("No idle actors available.") actor = self._idle_actors.get_nowait() + actor_is_alive = True + original_actor = actor try: ret = ray.get( actor.encode.remote(request_id=request_id, prompt=prompt, lora_request=lora_request)) + except ActorDiedError as e: + # If the actor is dead, we first try to reinitialize it. 
+ logger.warning("%s died with ActorDiedError, reinitializing.", + actor, + exc_info=e) + actor = self._init_actor() + try: + ret = ray.get( + actor.encode.remote(request_id=request_id, + prompt=prompt, + lora_request=lora_request)) + except ActorDiedError as e: + logger.error( + "%s died for second time in a row, marking " + "RayTokenizerGroupPool as unhealthy.", actor) + actor_is_alive = False + if not self._exception: + self._exception = e + self.check_health() finally: - # Put the actor back in the queue. - # This is done in a finally block to ensure that the actor is - # always put back in the queue, even if an exception/cancellation - # is raised. - self._idle_actors.put_nowait(actor) + self._finalize_encode(actor, original_actor, actor_is_alive) return ret async def encode_async( @@ -120,20 +162,37 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): The actor is then put back in the queue for future use. This is non-blocking. """ + self.check_health() self._ensure_queue_initialized() assert self._idle_actors is not None actor = await self._idle_actors.get() + actor_is_alive = True + original_actor = actor try: ret = await actor.encode.remote(request_id=request_id, prompt=prompt, lora_request=lora_request) + except ActorDiedError as e: + # If the actor is dead, we first try to reinitialize it. + logger.warning("%s died with ActorDiedError, reinitializing.", + actor, + exc_info=e) + actor = self._init_actor() + try: + ret = await actor.encode.remote(request_id=request_id, + prompt=prompt, + lora_request=lora_request) + except ActorDiedError as e: + logger.error( + "%s died for second time in a row, marking " + "RayTokenizerGroupPool as unhealthy.", actor) + actor_is_alive = False + if not self._exception: + self._exception = e + self.check_health() finally: - # Put the actor back in the queue. - # This is done in a finally block to ensure that the actor is - # always put back in the queue, even if an exception/cancellation - # is raised. 
- self._idle_actors.put_nowait(actor) + self._finalize_encode(actor, original_actor, actor_is_alive) return ret def get_max_input_len(self, @@ -155,6 +214,11 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): return await self._local_tokenizer_group.get_lora_tokenizer_async( lora_request) + def check_health(self): + if self._exception: + raise RuntimeError( + "TokenizerGroupPool is unhealthy.") from self._exception + def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None: """Copy over all current process environment variables to the runtime_env. -- GitLab From c18ebfdd71d16eb18617676b0b1d82ebde0027f0 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 25 Jun 2024 12:10:28 -0700 Subject: [PATCH 144/376] [doc][distributed] add both gloo and nccl tests (#5834) --- docs/source/getting_started/debugging.rst | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index a22bba147..4cd34769e 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -28,8 +28,8 @@ If it crashes, and the error trace shows somewhere around ``self.graph.replay()` Here are some common issues that can cause hangs: -- **Incorrect network setup**: The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. -- **Incorrect hardware/driver**: GPU communication cannot be established. You can run the following sanity check script to see if the GPU communication is working correctly. +- **Incorrect network setup**: The vLLM instance cannot get the correct IP address if you have complicated network config. 
You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. You might also need to set ``export NCCL_SOCKET_IFNAME=your_network_interface`` and ``export GLOO_SOCKET_IFNAME=your_network_interface`` to specify the network interface for the IP address. +- **Incorrect hardware/driver**: GPU/CPU communication cannot be established. You can run the following sanity check script to see if the GPU/CPU communication is working correctly. .. code-block:: python @@ -41,7 +41,14 @@ Here are some common issues that can cause hangs: dist.all_reduce(data, op=dist.ReduceOp.SUM) torch.cuda.synchronize() value = data.mean().item() - assert value == dist.get_world_size() + world_size = dist.get_world_size() + assert value == world_size, f"Expected {world_size}, got {value}" + + gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo") + cpu_data = torch.FloatTensor([1,] * 128) + dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group) + value = cpu_data.mean().item() + assert value == world_size, f"Expected {world_size}, got {value}" .. 
tip:: -- GitLab From d9b34baeddc7f48a526dc610429a3c8670b3b339 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 25 Jun 2024 15:18:03 -0400 Subject: [PATCH 145/376] [CI/Build] Add unit testing for FlexibleArgumentParser (#5798) --- tests/test_utils.py | 61 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 0b674ea6a..8203b5d2f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -7,7 +7,8 @@ from typing import (TYPE_CHECKING, Any, AsyncIterator, Awaitable, Protocol, import pytest -from vllm.utils import deprecate_kwargs, get_open_port, merge_async_iterators +from vllm.utils import (FlexibleArgumentParser, deprecate_kwargs, + get_open_port, merge_async_iterators) from .utils import error_on_warning @@ -130,3 +131,61 @@ def test_get_open_port(): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3: s3.bind(("localhost", get_open_port())) os.environ.pop("VLLM_PORT") + + +# Tests for FlexibleArgumentParser +@pytest.fixture +def parser(): + parser = FlexibleArgumentParser() + parser.add_argument('--image-input-type', + choices=['pixel_values', 'image_features']) + parser.add_argument('--model-name') + parser.add_argument('--batch-size', type=int) + parser.add_argument('--enable-feature', action='store_true') + return parser + + +def test_underscore_to_dash(parser): + args = parser.parse_args(['--image_input_type', 'pixel_values']) + assert args.image_input_type == 'pixel_values' + + +def test_mixed_usage(parser): + args = parser.parse_args([ + '--image_input_type', 'image_features', '--model-name', + 'facebook/opt-125m' + ]) + assert args.image_input_type == 'image_features' + assert args.model_name == 'facebook/opt-125m' + + +def test_with_equals_sign(parser): + args = parser.parse_args( + ['--image_input_type=pixel_values', '--model-name=facebook/opt-125m']) + assert args.image_input_type == 'pixel_values' + assert args.model_name == 
'facebook/opt-125m' + + +def test_with_int_value(parser): + args = parser.parse_args(['--batch_size', '32']) + assert args.batch_size == 32 + args = parser.parse_args(['--batch-size', '32']) + assert args.batch_size == 32 + + +def test_with_bool_flag(parser): + args = parser.parse_args(['--enable_feature']) + assert args.enable_feature is True + args = parser.parse_args(['--enable-feature']) + assert args.enable_feature is True + + +def test_invalid_choice(parser): + with pytest.raises(SystemExit): + parser.parse_args(['--image_input_type', 'invalid_choice']) + + +def test_missing_required_argument(parser): + parser.add_argument('--required-arg', required=True) + with pytest.raises(SystemExit): + parser.parse_args([]) -- GitLab From dd248f76756adba4a1637b882e79ab639f957feb Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 25 Jun 2024 15:23:35 -0400 Subject: [PATCH 146/376] [Misc] Update `w4a16` `compressed-tensors` support to include `w8a16` (#5794) --- tests/quantization/test_compressed_tensors.py | 23 +++++++-------- .../compressed_tensors/compressed_tensors.py | 28 +++++++++++-------- .../compressed_tensors/schemes/__init__.py | 5 ++-- .../schemes/compressed_tensors_w4a16_24.py | 1 + ...s_w4a16.py => compressed_tensors_wNa16.py} | 5 ++-- 5 files changed, 36 insertions(+), 26 deletions(-) rename vllm/model_executor/layers/quantization/compressed_tensors/schemes/{compressed_tensors_w4a16.py => compressed_tensors_wNa16.py} (98%) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index aaa366335..6eb7ff72f 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -8,9 +8,9 @@ import torch from vllm import SamplingParams from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 - CompressedTensorsLinearMethod, CompressedTensorsW4A16, - CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken, - 
CompressedTensorsW8A8StaticTensor) + CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24, + CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor, + CompressedTensorsWNA16) @pytest.mark.parametrize("model_args", [ @@ -74,26 +74,27 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args): assert qkv_proj.weight.dtype is torch.int8 -@pytest.mark.parametrize("w4a16_args", [ - ("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None), - ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128), -]) -def test_compressed_tensors_w4a16(vllm_runner, w4a16_args): - model, strategy, group = w4a16_args +@pytest.mark.parametrize( + "wNa16_args", + [("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8), + ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8), + ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4)]) +def test_compressed_tensors_w4a16(vllm_runner, wNa16_args): + model, strategy, group, pack_factor = wNa16_args with vllm_runner(model) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] qkv_proj = layer.self_attn.qkv_proj assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16) + assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16) assert qkv_proj.scheme.strategy == strategy assert qkv_proj.scheme.group_size == group assert qkv_proj.weight_packed.dtype is torch.int32 assert qkv_proj.weight_scale.dtype is torch.float16 - assert qkv_proj.weight_packed.pack_factor == 8 + assert qkv_proj.weight_packed.pack_factor == pack_factor def test_compressed_tensors_w4a16_marlin24(vllm_runner): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 44dd024af..c69e2f3bc 100644 --- 
a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -7,9 +7,10 @@ from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 QuantizationConfig) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsScheme, CompressedTensorsW4A16, - CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken, - CompressedTensorsW8A8StaticTensor) + W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, + CompressedTensorsScheme, CompressedTensorsW4A16Sparse24, + CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor, + CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match) @@ -108,26 +109,31 @@ class CompressedTensorsConfig(QuantizationConfig): return is_8_bits and is_token and is_symmetric and is_dynamic - def _is_w4a16(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_wNa16_group_channel(self, weight_quant: BaseModel, + input_quant: BaseModel) -> bool: input_quant_none = input_quant is None - is_4_bits = weight_quant.num_bits == 4 is_symmetric = weight_quant.symmetric + is_channel_group = ( + weight_quant.strategy == QuantizationStrategy.CHANNEL.value + or weight_quant.strategy == QuantizationStrategy.GROUP.value) is_static = not weight_quant.dynamic - return is_4_bits and input_quant_none and is_symmetric and is_static + return (is_channel_group and input_quant_none and is_symmetric + and is_static) def _get_schema(self, weight_quant: BaseModel, input_quant: BaseModel) -> "CompressedTensorsScheme": - if self._is_w4a16(weight_quant, input_quant): - if self.quant_format == CompressionFormat.marlin_24.value: + if 
self._is_wNa16_group_channel(weight_quant, input_quant): + if (self.quant_format == CompressionFormat.marlin_24.value + and weight_quant.num_bits in W4A16SPARSE24_SUPPORTED_BITS): return CompressedTensorsW4A16Sparse24( strategy=weight_quant.strategy, num_bits=weight_quant.num_bits, group_size=weight_quant.group_size) - if self.quant_format == CompressionFormat.pack_quantized.value: - return CompressedTensorsW4A16( + if (self.quant_format == CompressionFormat.pack_quantized.value + and weight_quant.num_bits in WNA16_SUPPORTED_BITS): + return CompressedTensorsWNA16( num_bits=weight_quant.num_bits, strategy=weight_quant.strategy, group_size=weight_quant.group_size) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index 3c95aa11f..f6d20ce2c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -1,10 +1,11 @@ from .compressed_tensors_scheme import CompressedTensorsScheme # noqa: F401 from .compressed_tensors_unquantized import ( # noqa: F401 CompressedTensorsUnquantized) -from .compressed_tensors_w4a16 import CompressedTensorsW4A16 # noqa: F401 from .compressed_tensors_w4a16_24 import ( # noqa: F401 - CompressedTensorsW4A16Sparse24) + W4A16SPARSE24_SUPPORTED_BITS, CompressedTensorsW4A16Sparse24) from .compressed_tensors_w8a8_dynamictoken import ( # noqa: F401, E501 CompressedTensorsW8A8DynamicToken) from .compressed_tensors_w8a8_statictensor import ( # noqa: F401, E501 CompressedTensorsW8A8StaticTensor) +from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS # noqa: F401 +from .compressed_tensors_wNa16 import CompressedTensorsWNA16 # noqa: F401 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py 
b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index d7e04ddb8..607029c81 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -11,6 +11,7 @@ from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( from vllm.model_executor.utils import set_weight_attrs __all__ = ["CompressedTensorsW4A16Sparse24"] +W4A16SPARSE24_SUPPORTED_BITS = [4] class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py similarity index 98% rename from vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py rename to vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 373458cff..7707ea6ee 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -11,10 +11,11 @@ from vllm.model_executor.layers.quantization.gptq_marlin import ( marlin_permute_scales) from vllm.model_executor.utils import set_weight_attrs -__all__ = ["CompressedTensorsW4A16"] +__all__ = ["CompressedTensorsWNA16"] +WNA16_SUPPORTED_BITS = [4, 8] -class CompressedTensorsW4A16(CompressedTensorsScheme): +class CompressedTensorsWNA16(CompressedTensorsScheme): def __init__(self, strategy: str, -- GitLab From bc34937d68e9715d8416457539fb528301cf6269 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 25 Jun 2024 15:25:52 -0700 Subject: [PATCH 147/376] [Hardware][TPU] Refactor TPU backend (#5831) --- vllm/executor/tpu_executor.py | 58 +++++++++++++++++++++------------ 
vllm/worker/tpu_model_runner.py | 4 +++ vllm/worker/tpu_worker.py | 35 +++++++++++++------- 3 files changed, 65 insertions(+), 32 deletions(-) diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py index 5ed00e137..7fe5349c9 100644 --- a/vllm/executor/tpu_executor.py +++ b/vllm/executor/tpu_executor.py @@ -1,4 +1,4 @@ -from typing import List, Set, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple import torch @@ -26,29 +26,45 @@ class TPUExecutor(ExecutorBase): self.model_config.dtype = torch.bfloat16 # Instantiate the worker and load the model to the device. - self._init_worker() - - def _init_worker(self): - from vllm.worker.tpu_worker import TPUWorker + self.driver_worker = self._create_worker() + self.driver_worker.init_device() + self.driver_worker.load_model() - assert self.parallel_config.world_size == 1, ( - "TPUExecutor currently only supports a single TPU chip.") - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = TPUWorker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - self.cache_config, - self.load_config, - self.vision_language_config, - local_rank=0, - rank=0, + def _get_worker_kwargs( + self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None, + ) -> Dict[str, Any]: + """Return worker init args for a given rank.""" + if distributed_init_method is None: + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + return dict( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + load_config=self.load_config, + local_rank=local_rank, + rank=rank, distributed_init_method=distributed_init_method, + vision_language_config=self.vision_language_config, + is_driver_worker=rank == 0, ) - self.driver_worker.init_device() - 
self.driver_worker.load_model() + + def _create_worker( + self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None, + ): + from vllm.worker.tpu_worker import TPUWorker + + worker = TPUWorker(**self._get_worker_kwargs(local_rank, rank, + distributed_init_method)) + return worker def initialize_cache( self, diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 5003d3b0c..2d8fffe5a 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -33,6 +33,7 @@ class TPUModelRunner: cache_config: CacheConfig, load_config: LoadConfig, vision_language_config: Optional[VisionLanguageConfig] = None, + is_driver_worker: bool = False, ): self.model_config = model_config self.parallel_config = parallel_config @@ -41,6 +42,7 @@ class TPUModelRunner: self.cache_config = cache_config self.load_config = load_config self.vision_language_config = vision_language_config + self.is_driver_worker = is_driver_worker self.block_size = self.cache_config.block_size self.max_num_blocks_per_seq = (self.model_config.max_model_len // @@ -373,6 +375,8 @@ class TPUModelRunner: inputs = self.prepare_inputs(seq_group_metadata_list) next_token_ids = self.model(inputs[0], inputs[1], kv_caches, *inputs[2:]) + if not self.is_driver_worker: + return [] next_token_ids = next_token_ids.cpu().tolist() i = 0 diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 04576015d..828bb89d7 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -34,6 +34,7 @@ class TPUWorker(LoraNotSupportedWorkerBase): local_rank: int, rank: int, distributed_init_method: str, + is_driver_worker: bool, ) -> None: self.model_config = model_config self.parallel_config = parallel_config @@ -45,6 +46,7 @@ class TPUWorker(LoraNotSupportedWorkerBase): self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method + self.is_driver_worker = is_driver_worker assert 
self.device_config.device_type == "tpu" if self.cache_config.cache_dtype == "auto": @@ -53,10 +55,14 @@ class TPUWorker(LoraNotSupportedWorkerBase): self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ self.cache_config.cache_dtype] - self.model_runner = TPUModelRunner(model_config, parallel_config, - scheduler_config, device_config, - cache_config, load_config, - vision_language_config) + self.model_runner = TPUModelRunner(model_config, + parallel_config, + scheduler_config, + device_config, + cache_config, + load_config, + vision_language_config, + is_driver_worker=is_driver_worker) def init_device(self) -> None: os.environ["PJRT_DEVICE"] = "TPU" @@ -175,16 +181,13 @@ class TPUWorker(LoraNotSupportedWorkerBase): def execute_model( self, - execute_model_req: Optional[ExecuteModelRequest] = None + execute_model_req: Optional[ExecuteModelRequest] = None, ) -> List[SamplerOutput]: - if execute_model_req is None: - return [] - - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - num_seq_groups = len(seq_group_metadata_list) - if num_seq_groups == 0: + if not self.is_driver_worker: + self._execute_model_non_driver() return [] + assert execute_model_req is not None # Currently, TPUWorker does not support swapping. # TODO(woosuk): Support block copying. 
assert len(execute_model_req.blocks_to_swap_in) == 0, ( @@ -193,6 +196,16 @@ class TPUWorker(LoraNotSupportedWorkerBase): "Swapping is not supported for the TPU backend.") assert len(execute_model_req.blocks_to_copy) == 0 + seq_group_metadata_list = execute_model_req.seq_group_metadata_list + assert len(seq_group_metadata_list) > 0 output = self.model_runner.execute_model(seq_group_metadata_list, self.tpu_cache) return [output] + + def start_worker_execution_loop(self) -> None: + while self._execute_model_non_driver(): + pass + + def _execute_model_non_driver(self) -> bool: + self.model_runner.execute_model(None, self.tpu_cache) + return True -- GitLab From dd793d1de59b5efad25f4794b68cb935824c7a11 Mon Sep 17 00:00:00 2001 From: Matt Wong <156021403+mawong-amd@users.noreply.github.com> Date: Tue, 25 Jun 2024 17:56:15 -0500 Subject: [PATCH 148/376] [Hardware][AMD][CI/Build][Doc] Upgrade to ROCm 6.1, Dockerfile improvements, test fixes (#5422) --- CMakeLists.txt | 20 +- Dockerfile.rocm | 209 ++++++++++++------ cmake/utils.cmake | 20 +- .../getting_started/amd-installation.rst | 6 +- tests/async_engine/test_openapi_server_ray.py | 4 +- tests/distributed/test_utils.py | 17 +- tests/entrypoints/test_openai_embedding.py | 4 +- tests/entrypoints/test_openai_server.py | 4 +- tests/entrypoints/test_openai_vision.py | 4 +- tests/utils.py | 38 +++- vllm/config.py | 10 +- .../custom_all_reduce_utils.py | 11 +- vllm/executor/multiproc_gpu_executor.py | 8 +- vllm/utils.py | 16 +- vllm/worker/worker_base.py | 10 +- 15 files changed, 259 insertions(+), 122 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aa15b632c..801429096 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,8 +32,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 # versions are derived from Dockerfile.rocm # set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0") -set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") -set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") 
+set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0") # # Try to find python package with an executable that exactly matches @@ -98,18 +97,11 @@ elseif(HIP_FOUND) # .hip extension automatically, HIP must be enabled explicitly. enable_language(HIP) - # ROCm 5.x - if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND - NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X}) - message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} " - "expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.") - endif() - - # ROCm 6.x - if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND - NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X}) - message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} " - "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.") + # ROCm 5.X and 6.X + if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND + NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM}) + message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} " + "expected for ROCm build, saw ${Torch_VERSION} instead.") endif() else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 6bda69685..652f04adf 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,34 +1,35 @@ -# default base image -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -FROM $BASE_IMAGE - -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -RUN echo "Base image is $BASE_IMAGE" - -ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \ - ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - +# Default ROCm 6.1 base image +ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging" + +# Tested and supported base rocm/pytorch images +ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \ + ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \ + 
ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging" + +# Default ROCm ARCHes to build vLLM for. +ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100" + +# Whether to build CK-based flash-attention +# If 0, will not build flash attention +# This is useful for gfx target where flash-attention is not supported +# (i.e. those that do not appear in `FA_GFX_ARCHS`) +# Triton FA is used by default on ROCm now so this is unnecessary. +ARG BUILD_FA="1" ARG FA_GFX_ARCHS="gfx90a;gfx942" -RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" - ARG FA_BRANCH="ae7928c" -RUN echo "FA_BRANCH is $FA_BRANCH" -# whether to build flash-attention -# if 0, will not build flash attention -# this is useful for gfx target where flash-attention is not supported -# In that case, we need to use the python reference attention implementation in vllm -ARG BUILD_FA="1" - -# whether to build triton on rocm +# Whether to build triton on rocm ARG BUILD_TRITON="1" +ARG TRITON_BRANCH="0ef1848" -# Install some basic utilities -RUN apt-get update && apt-get install python3 python3-pip -y +### Base image build stage +FROM $BASE_IMAGE AS base + +# Import arg(s) defined before this build stage +ARG PYTORCH_ROCM_ARCH # Install some basic utilities +RUN apt-get update && apt-get install python3 python3-pip -y RUN apt-get update && apt-get install -y \ curl \ ca-certificates \ @@ -39,79 +40,159 @@ RUN apt-get update && apt-get install -y \ build-essential \ wget \ unzip \ - nvidia-cuda-toolkit \ tmux \ ccache \ && rm -rf /var/lib/apt/lists/* -### Mount Point ### -# When launching the container, mount the code directory to /app +# When launching the container, mount the code directory to /vllm-workspace ARG APP_MOUNT=/vllm-workspace -VOLUME [ ${APP_MOUNT} ] WORKDIR ${APP_MOUNT} -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas +RUN pip install --upgrade pip +# Remove sccache so it doesn't interfere with ccache +# TODO: implement sccache 
support across components +RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)" +# Install torch == 2.4.0 on ROCm +RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-5.7"*) \ + pip uninstall -y torch \ + && pip install --no-cache-dir --pre torch==2.4.0.dev20240612 \ + --index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \ + *"rocm-6.0"*) \ + pip uninstall -y torch \ + && pip install --no-cache-dir --pre torch==2.4.0.dev20240612 \ + --index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \ + *"rocm-6.1"*) \ + pip uninstall -y torch \ + && pip install --no-cache-dir --pre torch==2.4.0.dev20240612 \ + --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ + *) ;; esac ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: -# Install ROCm flash-attention -RUN if [ "$BUILD_FA" = "1" ]; then \ - mkdir libs \ +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} +ENV CCACHE_DIR=/root/.cache/ccache + + +### AMD-SMI build stage +FROM base AS build_amdsmi +# Build amdsmi wheel always +RUN cd /opt/rocm/share/amd_smi \ + && pip wheel . 
--wheel-dir=/install + + +### Flash-Attention wheel build stage +FROM base AS build_fa +ARG BUILD_FA +ARG FA_GFX_ARCHS +ARG FA_BRANCH +# Build ROCm flash-attention wheel if `BUILD_FA = 1` +RUN --mount=type=cache,target=${CCACHE_DIR} \ + if [ "$BUILD_FA" = "1" ]; then \ + mkdir -p libs \ && cd libs \ && git clone https://github.com/ROCm/flash-attention.git \ && cd flash-attention \ - && git checkout ${FA_BRANCH} \ + && git checkout "${FA_BRANCH}" \ && git submodule update --init \ - && export GPU_ARCHS=${FA_GFX_ARCHS} \ - && if [ "$BASE_IMAGE" = "$ROCm_5_7_BASE" ]; then \ - patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ - && python3 setup.py install \ - && cd ..; \ + && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-5.7"*) \ + export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \ + && patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \ + *) ;; esac \ + && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \ + # Create an empty directory otherwise as later build stages expect one + else mkdir -p /install; \ fi -# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. -# Manually removed it so that later steps of numpy upgrade can continue -RUN if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ - rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi -# build triton -RUN if [ "$BUILD_TRITON" = "1" ]; then \ +### Triton wheel build stage +FROM base AS build_triton +ARG BUILD_TRITON +ARG TRITON_BRANCH +# Build triton wheel if `BUILD_TRITON = 1` +RUN --mount=type=cache,target=${CCACHE_DIR} \ + if [ "$BUILD_TRITON" = "1" ]; then \ mkdir -p libs \ && cd libs \ - && pip uninstall -y triton \ - && git clone https://github.com/ROCm/triton.git \ - && cd triton/python \ - && pip3 install . 
\ - && cd ../..; \ + && git clone https://github.com/OpenAI/triton.git \ + && cd triton \ + && git checkout "${TRITON_BRANCH}" \ + && cd python \ + && python3 setup.py bdist_wheel --dist-dir=/install; \ + # Create an empty directory otherwise as later build stages expect one + else mkdir -p /install; \ fi -WORKDIR /vllm-workspace + +### Final vLLM build stage +FROM base AS final +# Import the vLLM development directory from the build context COPY . . -#RUN python3 -m pip install pynvml # to be removed eventually -RUN python3 -m pip install --upgrade pip numba +# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. +# Manually remove it so that later steps of numpy upgrade can continue +RUN case "$(which python3)" in \ + *"/opt/conda/envs/py_3.9"*) \ + rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ + *) ;; esac + +# Package upgrades for useful functionality or to avoid dependency issues +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --upgrade numba scipy huggingface-hub[cli] -# make sure punica kernels are built (for LoRA) +# Make sure punica kernels are built (for LoRA) ENV VLLM_INSTALL_PUNICA_KERNELS=1 # Workaround for ray >= 2.10.0 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 +# Silences the HF Tokenizers warning +ENV TOKENIZERS_PARALLELISM=false -ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so - -ENV CCACHE_DIR=/root/.cache/ccache -RUN --mount=type=cache,target=/root/.cache/ccache \ +RUN --mount=type=cache,target=${CCACHE_DIR} \ --mount=type=cache,target=/root/.cache/pip \ pip install -U -r requirements-rocm.txt \ - && if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ - patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch; fi \ - && python3 setup.py install \ - && export VLLM_PYTHON_VERSION=$(python -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))") \ - && cp 
build/lib.linux-x86_64-cpython-${VLLM_PYTHON_VERSION}/vllm/*.so vllm/ \ - && cd .. + && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-6.0"*) \ + patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \ + *"rocm-6.1"*) \ + # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM + wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \ + && cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \ + # Prevent interference if torch bundles its own HIP runtime + && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \ + *) ;; esac \ + && python3 setup.py clean --all \ + && python3 setup.py develop + +# Copy amdsmi wheel into final image +RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \ + mkdir -p libs \ + && cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && pip uninstall -y amdsmi; +# Copy triton wheel(s) into final image if they were built +RUN --mount=type=bind,from=build_triton,src=/install,target=/install \ + mkdir -p libs \ + && if ls /install/*.whl; then \ + cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && pip uninstall -y triton; fi + +# Copy flash-attn wheel(s) into final image if they were built +RUN --mount=type=bind,from=build_fa,src=/install,target=/install \ + mkdir -p libs \ + && if ls /install/*.whl; then \ + cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && pip uninstall -y flash-attn; fi + +# Install wheels that were built to the final image +RUN --mount=type=cache,target=/root/.cache/pip \ + if ls libs/*.whl; then \ + pip install libs/*.whl; fi CMD ["/bin/bash"] diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 071e16336..4869cad54 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -147,19 +147,23 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG 
GPU_SUPPORTED_ARCHES) if (${GPU_LANG} STREQUAL "HIP") # # `GPU_ARCHES` controls the `--offload-arch` flags. - # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled - # via the `PYTORCH_ROCM_ARCH` env variable. # - + # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list, + # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling + # "rocm_agent_enumerator" in "enable_language(HIP)" + # (in file Modules/CMakeDetermineHIPCompiler.cmake) + # + if(DEFINED ENV{PYTORCH_ROCM_ARCH}) + set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH}) + else() + set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES}) + endif() # # Find the intersection of the supported + detected architectures to # set the module architecture flags. # - - set(VLLM_ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") - set(${GPU_ARCHES}) - foreach (_ARCH ${VLLM_ROCM_SUPPORTED_ARCHS}) + foreach (_ARCH ${HIP_ARCHITECTURES}) if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) list(APPEND ${GPU_ARCHES} ${_ARCH}) endif() @@ -167,7 +171,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) if(NOT ${GPU_ARCHES}) message(FATAL_ERROR - "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" + "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is" " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.") endif() diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 61fcd45a2..cc41d4729 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -88,7 +88,7 @@ Option 2: Build from source - `Pytorch `_ - `hipBLAS `_ -For installing PyTorch, you can start from a fresh docker image, e.g, `rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`. 
+For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`. Alternatively, you can install pytorch using pytorch wheels. You can check Pytorch installation guild in Pytorch `Getting Started `_ @@ -126,12 +126,12 @@ Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/fl $ cd vllm $ pip install -U -r requirements-rocm.txt - $ python setup.py install # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation + $ python setup.py develop # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation .. tip:: - You may need to turn on the ``--enforce-eager`` flag if you experience process hang when running the `benchmark_thoughput.py` script to test your installation. - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - - To use CK flash-attention, please use this flag ``export VLLM_USE_FLASH_ATTN_TRITON=0`` to turn off triton flash attention. + - To use CK flash-attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. - The ROCm version of pytorch, ideally, should match the ROCm driver version. diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index cc05d79e5..332937b87 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -4,7 +4,7 @@ import pytest # and debugging. 
import ray -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "facebook/opt-125m" @@ -12,7 +12,7 @@ MODEL_NAME = "facebook/opt-125m" @pytest.fixture(scope="module") def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) + ray.init() yield ray.shutdown() diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 49d11daca..9ff11b0d2 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -1,8 +1,8 @@ -import os - import ray -from vllm.utils import cuda_device_count_stateless +import vllm.envs as envs +from vllm.utils import (cuda_device_count_stateless, is_hip, + update_environment_variables) @ray.remote @@ -12,16 +12,21 @@ class _CUDADeviceCountStatelessTestActor: return cuda_device_count_stateless() def set_cuda_visible_devices(self, cuda_visible_devices: str): - os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices + update_environment_variables( + {"CUDA_VISIBLE_DEVICES": cuda_visible_devices}) def get_cuda_visible_devices(self): - return os.environ["CUDA_VISIBLE_DEVICES"] + return envs.CUDA_VISIBLE_DEVICES def test_cuda_device_count_stateless(): """Test that cuda_device_count_stateless changes return value if CUDA_VISIBLE_DEVICES is changed.""" - + if is_hip(): + # Set HIP_VISIBLE_DEVICES == CUDA_VISIBLE_DEVICES. 
Conversion + # is handled by `update_environment_variables` + update_environment_variables( + {"CUDA_VISIBLE_DEVICES": envs.CUDA_VISIBLE_DEVICES}) actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore num_gpus=2).remote() assert sorted(ray.get( diff --git a/tests/entrypoints/test_openai_embedding.py b/tests/entrypoints/test_openai_embedding.py index 2496d2ac3..45f701733 100644 --- a/tests/entrypoints/test_openai_embedding.py +++ b/tests/entrypoints/test_openai_embedding.py @@ -2,7 +2,7 @@ import openai import pytest import ray -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import RemoteOpenAIServer EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" @@ -11,7 +11,7 @@ pytestmark = pytest.mark.openai @pytest.fixture(scope="module") def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) + ray.init() yield ray.shutdown() diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index c22a675ff..5196d8181 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -16,7 +16,7 @@ from openai import BadRequestError from vllm.transformers_utils.tokenizer import get_tokenizer -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -81,7 +81,7 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) + ray.init() yield ray.shutdown() diff --git a/tests/entrypoints/test_openai_vision.py b/tests/entrypoints/test_openai_vision.py index 03dc5d116..0e8d88b76 100644 --- a/tests/entrypoints/test_openai_vision.py +++ b/tests/entrypoints/test_openai_vision.py @@ -8,7 +8,7 @@ import ray from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64 -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import RemoteOpenAIServer MODEL_NAME = 
"llava-hf/llava-1.5-7b-hf" LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent / @@ -27,7 +27,7 @@ pytestmark = pytest.mark.openai @pytest.fixture(scope="module") def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) + ray.init() yield ray.shutdown() diff --git a/tests/utils.py b/tests/utils.py index 174efca4a..2a5f82b91 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -15,9 +15,30 @@ from vllm.distributed import (ensure_model_parallel_initialized, from vllm.entrypoints.openai.cli_args import make_arg_parser from vllm.utils import get_open_port, is_hip -if (not is_hip()): +if is_hip(): + from amdsmi import (amdsmi_get_gpu_vram_usage, + amdsmi_get_processor_handles, amdsmi_init, + amdsmi_shut_down) + + @contextmanager + def _nvml(): + try: + amdsmi_init() + yield + finally: + amdsmi_shut_down() +else: from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, - nvmlInit) + nvmlInit, nvmlShutdown) + + @contextmanager + def _nvml(): + try: + nvmlInit() + yield + finally: + nvmlShutdown() + # Path to root of repository so that utilities can be imported by ray workers VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)) @@ -160,20 +181,25 @@ def error_on_warning(): yield +@_nvml() def wait_for_gpu_memory_to_clear(devices: List[int], threshold_bytes: int, timeout_s: float = 120) -> None: # Use nvml instead of pytorch to reduce measurement error from torch cuda # context. 
- nvmlInit() start_time = time.time() while True: output: Dict[int, str] = {} output_raw: Dict[int, float] = {} for device in devices: - dev_handle = nvmlDeviceGetHandleByIndex(device) - mem_info = nvmlDeviceGetMemoryInfo(dev_handle) - gb_used = mem_info.used / 2**30 + if is_hip(): + dev_handle = amdsmi_get_processor_handles()[device] + mem_info = amdsmi_get_gpu_vram_usage(dev_handle) + gb_used = mem_info["vram_used"] / 2**10 + else: + dev_handle = nvmlDeviceGetHandleByIndex(device) + mem_info = nvmlDeviceGetMemoryInfo(dev_handle) + gb_used = mem_info.used / 2**30 output_raw[device] = gb_used output[device] = f'{gb_used:.02f}' diff --git a/vllm/config.py b/vllm/config.py index 0217a2b56..0c4d770e4 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -7,13 +7,15 @@ from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Tuple, import torch from transformers import PretrainedConfig, PreTrainedTokenizerBase +import vllm.envs as envs from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.models import ModelRegistry from vllm.tracing import is_otel_installed from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu, - is_hip, is_neuron, is_tpu, is_xpu) + is_hip, is_neuron, is_tpu, is_xpu, + update_environment_variables) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -634,6 +636,12 @@ class ParallelConfig: self.distributed_executor_backend = backend logger.info("Defaulting to use %s for distributed inference", backend) + # If CUDA_VISIBLE_DEVICES is set on ROCm prior to vLLM init, + # propagate changes to HIP_VISIBLE_DEVICES (conversion handled by + # the update_environment_variables function) + if is_hip() and envs.CUDA_VISIBLE_DEVICES: + update_environment_variables( + {"CUDA_VISIBLE_DEVICES": envs.CUDA_VISIBLE_DEVICES}) self._verify_args() diff --git 
a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index d3e41fa71..6f1aaed98 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -13,7 +13,8 @@ import torch.multiprocessing as mp import vllm.envs as envs from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary from vllm.logger import init_logger -from vllm.utils import cuda_device_count_stateless +from vllm.utils import (cuda_device_count_stateless, + update_environment_variables) logger = init_logger(__name__) @@ -24,7 +25,8 @@ def producer(batch_src: Sequence[int], result_queue, cuda_visible_devices: Optional[str] = None): if cuda_visible_devices is not None: - os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices + update_environment_variables( + {"CUDA_VISIBLE_DEVICES": cuda_visible_devices}) lib = CudaRTLibrary() for i in batch_src: @@ -56,7 +58,8 @@ def consumer(batch_tgt: Sequence[int], result_queue, cuda_visible_devices: Optional[str] = None): if cuda_visible_devices is not None: - os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices + update_environment_variables( + {"CUDA_VISIBLE_DEVICES": cuda_visible_devices}) lib = CudaRTLibrary() for j in batch_tgt: @@ -123,7 +126,7 @@ def can_actually_p2p( processes for testing all pairs of GPUs in batch. The trick is to reset the device after each test (which is not available in PyTorch). 
""" # noqa - cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None) + cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES # pass the CUDA_VISIBLE_DEVICES to the child process # to make sure they see the same set of GPUs diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index e63e5a3a0..a5b1d27f2 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -11,7 +11,8 @@ from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (cuda_device_count_stateless, get_distributed_init_method, get_open_port, - get_vllm_instance_id, make_async) + get_vllm_instance_id, make_async, + update_environment_variables) logger = init_logger(__name__) @@ -25,8 +26,9 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor): # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers if "CUDA_VISIBLE_DEVICES" not in os.environ: - os.environ["CUDA_VISIBLE_DEVICES"] = (",".join( - map(str, range(world_size)))) + update_environment_variables({ + "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size)))) + }) # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id() diff --git a/vllm/utils.py b/vllm/utils.py index f0c7df5cf..92abdb3fb 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -376,6 +376,10 @@ def get_open_port() -> int: def update_environment_variables(envs: Dict[str, str]): + if is_hip() and "CUDA_VISIBLE_DEVICES" in envs: + # Propagate changes to CUDA_VISIBLE_DEVICES to + # ROCm's HIP_VISIBLE_DEVICES as well + envs["HIP_VISIBLE_DEVICES"] = envs["CUDA_VISIBLE_DEVICES"] for k, v in envs.items(): if k in os.environ and os.environ[k] != v: logger.warning( @@ -779,9 +783,14 @@ def _cuda_device_count_stateless( if not torch.cuda._is_compiled(): return 0 - # bypass _device_count_nvml() if rocm (not supported) - nvml_count = -1 if torch.version.hip else 
torch.cuda._device_count_nvml() - r = torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count + if is_hip(): + # ROCm uses amdsmi instead of nvml for stateless device count + # This requires a sufficiently modern version of Torch 2.4.0 + raw_count = torch.cuda._device_count_amdsmi() if (hasattr( + torch.cuda, "_device_count_amdsmi")) else -1 + else: + raw_count = torch.cuda._device_count_nvml() + r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count return r @@ -795,7 +804,6 @@ def cuda_device_count_stateless() -> int: # This can be removed and simply replaced with torch.cuda.get_device_count # after https://github.com/pytorch/pytorch/pull/122815 is released. - return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index dc09718de..99482aa93 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -6,7 +6,7 @@ from typing import Dict, List, Optional, Set, Tuple from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (enable_trace_function_call_for_thread, +from vllm.utils import (enable_trace_function_call_for_thread, is_hip, update_environment_variables) logger = init_logger(__name__) @@ -125,6 +125,14 @@ class WorkerWrapperBase: # overwriting CUDA_VISIBLE_DEVICES is desired behavior # suppress the warning in `update_environment_variables` del os.environ[key] + if is_hip(): + hip_env_var = "HIP_VISIBLE_DEVICES" + if hip_env_var in os.environ: + logger.warning( + "Ignoring pre-set environment variable `%s=%s` as " + "%s has also been set, which takes precedence.", + hip_env_var, os.environ[hip_env_var], key) + os.environ.pop(hip_env_var, None) update_environment_variables(envs) def init_worker(self, *args, **kwargs): -- GitLab From f178e56c68d97e3a29a8a885a09dd61f8d534732 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 25 Jun 2024 
16:58:23 -0700 Subject: [PATCH 149/376] [Hardware][TPU] Raise errors for unsupported sampling params (#5850) --- vllm/worker/tpu_model_runner.py | 63 +++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 19 deletions(-) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 2d8fffe5a..2c70c1f91 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -20,6 +20,8 @@ from vllm.utils import make_tensor_with_pad logger = init_logger(__name__) _PAD_SLOT_ID = 0 # FIXME(woosuk) +# FIXME(woosuk): Temporarily disabled top-p sampling since it's too slow. +_ENABLE_TOP_P = False class TPUModelRunner: @@ -339,9 +341,34 @@ class TPUModelRunner: assert seq_group_metadata.sampling_params is not None sampling_params = seq_group_metadata.sampling_params + # NOTE(woosuk): Here we mimic argmax sampling by applying a very + # low temperature. This is not accurate. t.append(sampling_params.temperature if sampling_params.temperature >= 1e-5 else 1e-5) + if sampling_params.top_p != 1 and not _ENABLE_TOP_P: + raise NotImplementedError( + "Top-p sampling is currently disabled for the TPU backend " + "due to performance issues.") p.append(sampling_params.top_p) + if sampling_params.top_k != -1: + raise NotImplementedError( + "Top-k sampling is currently disabled for the TPU backend " + "due to performance issues.") + if sampling_params.best_of > 1: + raise NotImplementedError( + "best_of > 1 is not currently supported by the TPU " + "backend.") + if sampling_params.use_beam_search: + raise NotImplementedError( + "Beam search is not supported by the TPU backend.") + if sampling_params.logprobs is not None: + raise NotImplementedError( + "logprobs is not currently supported by the TPU backend.") + if sampling_params.prompt_logprobs is not None: + raise NotImplementedError( + "prompt_logprobs is not currently supported by the TPU " + "backend.") + num_paddings = padded_batch_size - len(seq_group_metadata_list) t += [1.0] * 
num_paddings p += [1.0] * num_paddings @@ -350,35 +377,32 @@ class TPUModelRunner: p = torch.tensor(p, dtype=torch.float32, device=self.device) return t, p - def prepare_inputs( + def _execute_model( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - ): - assert seq_group_metadata_list is not None + seq_group_metadata_list: List[SequenceGroupMetadata], + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + ) -> List[CompletionSequenceGroupOutput]: + # Prepare inputs. assert len(seq_group_metadata_list) > 0 # NOTE: We assume that all sequences in the group are all prompts or # all decodes. - if seq_group_metadata_list[0].is_prompt: + is_prompt = seq_group_metadata_list[0].is_prompt + if is_prompt: inputs = self._prepare_prompt(seq_group_metadata_list) else: inputs = self._prepare_decode(seq_group_metadata_list) padded_batch_size = inputs[0].shape[0] - sample_inputs = self._prepare_sample(seq_group_metadata_list, - padded_batch_size) - return inputs + sample_inputs + t, p = self._prepare_sample(seq_group_metadata_list, padded_batch_size) - def _execute_model( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - ) -> List[CompletionSequenceGroupOutput]: - inputs = self.prepare_inputs(seq_group_metadata_list) + # Execute the model. next_token_ids = self.model(inputs[0], inputs[1], kv_caches, - *inputs[2:]) - if not self.is_driver_worker: - return [] + *inputs[2:], t, p) + # Retrieve the outputs to CPU. next_token_ids = next_token_ids.cpu().tolist() + # NOTE(woosuk): Minimal code to construct the sampler outputs. + # The TPU backend does not reuse the sampler, since the TPU backend + # does not support the advanced sampling parameters such as logprobs. 
i = 0 sampler_outputs = [] for seq_group_metadata in seq_group_metadata_list: @@ -400,6 +424,7 @@ class TPUModelRunner: kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], ) -> SamplerOutput: assert seq_group_metadata_list is not None + assert len(seq_group_metadata_list) > 0 if seq_group_metadata_list[0].is_prompt: # NOTE(woosuk): To reduce the compilation time, we only compile the # prefill inputs with batch size 1. Because the scheduler is not @@ -492,8 +517,8 @@ class ModelWrapper(nn.Module): logits = self.model.compute_logits(hidden_states, sampling_metadata) logits = logits / t.unsqueeze(dim=1) - # FIXME(woosuk): Disabled top-p sampling since it's too slow. - # logits = _apply_top_p(logits, p.unsqueeze(dim=1)) + if _ENABLE_TOP_P: + logits = _apply_top_p(logits, p.unsqueeze(dim=1)) probs = torch.softmax(logits, dim=-1, dtype=torch.float32) # FIXME(woosuk): best_of > 1 is not supported. next_token_ids = torch.multinomial(probs, num_samples=1).squeeze(dim=1) -- GitLab From c2a8ac75e03aec19dad397a8e64377d37c67239a Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 26 Jun 2024 01:04:08 +0100 Subject: [PATCH 150/376] [CI/Build] Add E2E tests for MLPSpeculator (#5791) Signed-off-by: Thomas Parnell --- tests/spec_decode/e2e/test_mlp_correctness.py | 216 ++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100644 tests/spec_decode/e2e/test_mlp_correctness.py diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py new file mode 100644 index 000000000..9a9f2acbb --- /dev/null +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -0,0 +1,216 @@ +"""This docstring details important information on the testing methodology. + +Most of the tests rely on "greedy equality", where we expect the output of +speculative decoding on a sequence to exactly match the output of normal non- +speculative decoding. 
+ +Since speculative decoding with rejection sampling guarantees that the output +distribution matches the target model's output distribution (up to hardware +numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy +equality. + +However, we still need to verify below scenario could be passed: + * Batch size 1 greedy equality + * Batch size >1 greedy equality + * Test greedy equality under preemption + * Test greedy equality under various number of speculative tokens. + +With those tests, we can say at least, MLPSpeculator would not break the +correctess for the target model outputs. +""" + +import pytest + +from .conftest import run_greedy_equality_correctness_test + +# main model +MAIN_MODEL = "ibm-granite/granite-3b-code-instruct" + +# speculative model +SPEC_MODEL = "ibm-granite/granite-3b-code-instruct-accelerator" + +# max. number of speculative tokens: this corresponds to +# n_predict in the config.json of the speculator model. +MAX_SPEC_TOKENS = 5 + +# precision +PRECISION = "float16" + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + + # Print spec metrics. 
+ "disable_log_stats": False, + + # Precision + "dtype": PRECISION, + + # Main model + "model": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": SPEC_MODEL, + }, +]) +@pytest.mark.parametrize("output_len", [ + 128, +]) +@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("seed", [1]) +def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify greedy equality with different batch size.""" + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "block_size": 8, + # 2 for small prompt, 256//8 for generated. + "num_gpu_blocks_override": 2 + 256 // 8, + "max_model_len": (2 + 256 // 8) * 8, + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": SPEC_MODEL, + }, +]) +@pytest.mark.parametrize( + "output_len", + [ + # Use small output len for fast test. + 128, + ]) +@pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("seed", [1]) +def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator, + test_llm_generator, + batch_size: int, + output_len: int): + """Verify greedy equality, even when some sequences are preempted mid- + generation. 
+ """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + { + "speculative_model": SPEC_MODEL, + "num_speculative_tokens": k, + } + # Try a range of num. speculative tokens + for k in range(1, 1 + MAX_SPEC_TOKENS) + ]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_mlp_different_k(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify that mlp speculative decoding produces exact equality + to without spec decode with different values of num_speculative_tokens. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. 
+ "use_v2_block_manager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", + [{ + "speculative_model": SPEC_MODEL, + "speculative_disable_by_batch_size": 4 + }]) +@pytest.mark.parametrize("batch_size", [1, 5]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_mlp_disable_queue(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify that mlp speculative decoding produces exact equality + to without spec decode when speculation is disabled for large + batch sizes. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) -- GitLab From 82079729ccd0830ce77fcc5fd7ea2be3bf81ccaf Mon Sep 17 00:00:00 2001 From: aws-patlange <90803007+aws-patlange@users.noreply.github.com> Date: Tue, 25 Jun 2024 19:52:10 -0700 Subject: [PATCH 151/376] [Bugfix] Fix assertion in NeuronExecutor (#5841) --- vllm/executor/neuron_executor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index c5e2fb0f6..1a3329749 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -48,9 +48,9 @@ class NeuronExecutor(ExecutorBase): def execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - assert (execute_model_req.blocks_to_swap_in == {} - and execute_model_req.blocks_to_swap_out == {} - and execute_model_req.blocks_to_copy == {}), ( + assert (not execute_model_req.blocks_to_swap_in + and not execute_model_req.blocks_to_swap_out + and not execute_model_req.blocks_to_copy), ( "Cache operations are not supported for Neuron 
backend.") assert execute_model_req.num_lookahead_slots == 0, ( "lookahead not supported for Neuron backend.") -- GitLab From dda4811591fdb90d263bc9b8ac522436369aef13 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 25 Jun 2024 20:30:03 -0700 Subject: [PATCH 152/376] [Core] Refactor Worker and ModelRunner to consolidate control plane communication (#5408) Signed-off-by: Stephanie Wang Signed-off-by: Stephanie Co-authored-by: Stephanie --- tests/worker/test_model_input.py | 152 ++++++++ tests/worker/test_model_runner.py | 57 +-- vllm/attention/backends/abstract.py | 6 +- vllm/attention/backends/blocksparse_attn.py | 4 +- vllm/attention/backends/flash_attn.py | 4 +- vllm/attention/backends/flashinfer.py | 4 +- vllm/attention/backends/ipex_attn.py | 4 +- vllm/attention/backends/pallas.py | 4 +- vllm/attention/backends/rocm_flash_attn.py | 4 +- vllm/attention/backends/torch_sdpa.py | 4 +- vllm/attention/backends/xformers.py | 4 +- vllm/executor/distributed_gpu_executor.py | 16 +- vllm/executor/executor_base.py | 4 +- vllm/executor/gpu_executor.py | 2 +- vllm/executor/multiproc_gpu_executor.py | 8 +- vllm/executor/neuron_executor.py | 3 +- vllm/executor/ray_gpu_executor.py | 5 +- vllm/sequence.py | 3 +- vllm/spec_decode/mlp_speculator_worker.py | 3 +- vllm/worker/cpu_model_runner.py | 161 +++++---- vllm/worker/cpu_worker.py | 85 ++--- vllm/worker/embedding_model_runner.py | 129 +++---- vllm/worker/model_runner.py | 367 +++++++++++--------- vllm/worker/model_runner_base.py | 157 +++++++++ vllm/worker/neuron_model_runner.py | 64 +++- vllm/worker/neuron_worker.py | 39 +-- vllm/worker/worker.py | 129 ++----- vllm/worker/worker_base.py | 170 ++++++++- vllm/worker/xpu_model_runner.py | 91 ++++- 29 files changed, 1108 insertions(+), 575 deletions(-) create mode 100644 tests/worker/test_model_input.py create mode 100644 vllm/worker/model_runner_base.py diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py new file mode 100644 index 
000000000..ae818ee36 --- /dev/null +++ b/tests/worker/test_model_input.py @@ -0,0 +1,152 @@ +import dataclasses +from typing import List, Tuple, Type + +import torch + +from vllm.attention import AttentionMetadata +from vllm.attention.backends.abstract import AttentionBackend +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.pooling_metadata import PoolingMetadata +from vllm.worker.embedding_model_runner import ( + ModelInputForGPUWithPoolingMetadata) +from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata + + +class MockAttentionBackend(AttentionBackend): + + @staticmethod + def get_name() -> str: + raise NotImplementedError + + @staticmethod + def get_impl_cls(): + raise NotImplementedError + + @staticmethod + def get_metadata_cls() -> Type["AttentionMetadata"]: + return AttentionMetadata + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + raise NotImplementedError + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: torch.Tensor, + ) -> None: + pass + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: torch.Tensor, + ) -> None: + pass + + +def test_model_runner_input(): + sampling_metadata = SamplingMetadata( + ["seq_group"], + "selected_token_indices", + "categorized_sample_indices", + "num_prompts", + ) + attn_metadata = AttentionMetadata( + num_prefills=1, + num_prefill_tokens=2, + num_decode_tokens=3, + slot_mapping=torch.zeros(1), + ) + model_input = ModelInputForGPUWithSamplingMetadata( + input_tokens=torch.ones(10), + input_positions=torch.ones(10), + sampling_metadata=sampling_metadata, + attn_metadata=attn_metadata) + + assert isinstance(model_input, ModelInputForGPUWithSamplingMetadata) + + # Test round trip serialization. 
+ tensor_dict = model_input.as_broadcastable_tensor_dict() + attn_backend = MockAttentionBackend() + received_model_input = ( + ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( + tensor_dict, attn_backend=attn_backend)) + # Check that received copy has correct values. + assert isinstance(received_model_input, + ModelInputForGPUWithSamplingMetadata) + assert received_model_input.input_tokens is not None + assert ( + received_model_input.input_tokens == model_input.input_tokens).all() + assert received_model_input.input_positions is not None + assert (received_model_input.input_positions == model_input.input_positions + ).all() + assert received_model_input.multi_modal_kwargs is None + assert (received_model_input.multi_modal_kwargs == + model_input.multi_modal_kwargs) + assert received_model_input.lora_requests is None + assert received_model_input.lora_requests == model_input.lora_requests + assert received_model_input.lora_mapping is None + assert received_model_input.lora_mapping == model_input.lora_mapping + for field in dataclasses.fields(AttentionMetadata): + assert getattr(received_model_input.attn_metadata, field.name, + None) == getattr(attn_metadata, field.name, None) + # For sampling metadata, only selected_token_indices is copied. 
+ assert (received_model_input.sampling_metadata.selected_token_indices == + sampling_metadata.selected_token_indices) + assert received_model_input.sampling_metadata.seq_groups is None + + +def test_embedding_model_runner_input(): + pooling_metadata = PoolingMetadata( + seq_groups=[[0]], + seq_data={}, + prompt_lens=[1], + ) + attn_metadata = AttentionMetadata( + num_prefills=1, + num_prefill_tokens=2, + num_decode_tokens=3, + slot_mapping=torch.zeros(1), + ) + model_input = ModelInputForGPUWithPoolingMetadata( + input_tokens=torch.ones(10), + input_positions=torch.ones(10), + pooling_metadata=pooling_metadata, + attn_metadata=attn_metadata) + + assert isinstance(model_input, ModelInputForGPUWithPoolingMetadata) + + # Test round trip serialization. + tensor_dict = model_input.as_broadcastable_tensor_dict() + attn_backend = MockAttentionBackend() + received_model_input = ( + ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict( + tensor_dict, attn_backend=attn_backend)) + # Check that received copy has correct values. 
+ assert isinstance(received_model_input, + ModelInputForGPUWithPoolingMetadata) + assert received_model_input.input_tokens is not None + assert ( + received_model_input.input_tokens == model_input.input_tokens).all() + assert received_model_input.input_positions is not None + assert (received_model_input.input_positions == model_input.input_positions + ).all() + assert received_model_input.multi_modal_kwargs is None + assert (received_model_input.multi_modal_kwargs == + model_input.multi_modal_kwargs) + assert received_model_input.lora_requests is None + assert received_model_input.lora_requests == model_input.lora_requests + assert received_model_input.lora_mapping is None + assert received_model_input.lora_mapping == model_input.lora_mapping + for field in dataclasses.fields(AttentionMetadata): + assert getattr(received_model_input.attn_metadata, field.name, + None) == getattr(attn_metadata, field.name, None) + # Pooling metadata is not broadcast. + assert received_model_input.pooling_metadata is None diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index dd0d3bf50..e1775790c 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -61,12 +61,13 @@ def test_prepare_prompt(batch_size): expected_selected_token_indices.append(selected_token_start_idx + seq_len - 1) selected_token_start_idx += seq_len - model_input = model_runner._prepare_model_input(seq_group_metadata_list) + model_input = model_runner._prepare_model_input_tensors( + seq_group_metadata_list) input_tokens = model_input.input_tokens input_positions = model_input.input_positions attn_metadata = model_input.attn_metadata return_seq_lens = model_input.seq_lens - slot_mapping = model_input.slot_mapping + slot_mapping = attn_metadata.slot_mapping assert return_seq_lens == seq_lens assert len(slot_mapping) == len(input_tokens) @@ -174,10 +175,11 @@ def test_prepare_decode_cuda_graph(batch_size): assert seq_group_metadata.token_chunk_size == 
1 seq_group_metadata_list.append(seq_group_metadata) - model_input = model_runner._prepare_model_input(seq_group_metadata_list) + model_input = model_runner._prepare_model_input_tensors( + seq_group_metadata_list) input_tokens, input_positions, attn_metadata, slot_mapping = ( model_input.input_tokens, model_input.input_positions, - model_input.attn_metadata, model_input.slot_mapping) + model_input.attn_metadata, model_input.attn_metadata.slot_mapping) assert len(slot_mapping) == len(input_tokens) expected_bs = _get_graph_batch_size(len(seq_group_metadata_list)) @@ -259,32 +261,29 @@ def test_empty_seq_group(): enforce_eager=False, ) seq_group_metadata_list: List[SequenceGroupMetadata] = [] - model_input = model_runner._prepare_model_input(seq_group_metadata_list) - input_tokens, input_positions, attn_metadata, slot_mapping = ( + model_input = model_runner._prepare_model_input_tensors( + seq_group_metadata_list) + input_tokens, input_positions, attn_metadata = ( model_input.input_tokens, model_input.input_positions, model_input.attn_metadata, - model_input.slot_mapping, ) - assert len(input_tokens) == 0 - assert len(input_positions) == 0 + assert input_tokens is None + assert input_positions is None assert attn_metadata is None - assert len(slot_mapping) == 0 - - model_input = model_runner._prepare_model_input(seq_group_metadata_list) - (input_tokens, input_positions, attn_metadata, slot_mapping, - return_seq_lens) = ( - model_input.input_tokens, - model_input.input_positions, - model_input.attn_metadata, - model_input.slot_mapping, - model_input.seq_lens, - ) - assert len(input_tokens) == 0 - assert len(input_positions) == 0 + + model_input = model_runner._prepare_model_input_tensors( + seq_group_metadata_list) + (input_tokens, input_positions, attn_metadata, return_seq_lens) = ( + model_input.input_tokens, + model_input.input_positions, + model_input.attn_metadata, + model_input.seq_lens, + ) + assert input_tokens is None + assert input_positions is None assert 
attn_metadata is None - assert len(slot_mapping) == 0 - assert len(return_seq_lens) == 0 + assert return_seq_lens is None @pytest.fixture @@ -353,8 +352,12 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): seq_group_metadata_list.append(seq_group_metadata) decode_metadata_list.append(seq_group_metadata) - (input_tokens, input_positions, attn_metadata, _, _, _, - _) = model_runner.prepare_input_tensors(seq_group_metadata_list) + model_input = model_runner.prepare_model_input(seq_group_metadata_list) + (input_tokens, input_positions, attn_metadata) = ( + model_input.input_tokens, + model_input.input_positions, + model_input.attn_metadata, + ) prefill_meta_actual = attn_metadata.prefill_metadata decode_meta_actual = attn_metadata.decode_metadata @@ -367,7 +370,7 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): # Verify attn metadata is consistent. We don't need to test individual # values here because they are tested above. - attn_metadata = model_runner._prepare_model_input( + attn_metadata = model_runner._prepare_model_input_tensors( seq_group_metadata_list).attn_metadata for attr_expected, attr_actual in zip(vars(attn_metadata.prefill_metadata), diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 6396103bf..40768532f 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -21,9 +21,13 @@ class AttentionBackend(ABC): @staticmethod @abstractmethod - def make_metadata(*args, **kwargs) -> "AttentionMetadata": + def get_metadata_cls() -> Type["AttentionMetadata"]: raise NotImplementedError + @classmethod + def make_metadata(cls, *args, **kwargs) -> "AttentionMetadata": + return cls.get_metadata_cls()(*args, **kwargs) + @staticmethod @abstractmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index dce2b8361..7b4578fcd 100644 --- 
a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -90,8 +90,8 @@ class BlocksparseFlashAttentionBackend(AttentionBackend): return BlocksparseFlashAttentionImpl @staticmethod - def make_metadata(*args, **kwargs) -> "BlocksparseFlashAttentionMetadata": - return BlocksparseFlashAttentionMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["AttentionMetadata"]: + return BlocksparseFlashAttentionMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 1c48e2a0b..8cb5c3101 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -25,8 +25,8 @@ class FlashAttentionBackend(AttentionBackend): return FlashAttentionImpl @staticmethod - def make_metadata(*args, **kwargs) -> "FlashAttentionMetadata": - return FlashAttentionMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["AttentionMetadata"]: + return FlashAttentionMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 7b7959d25..535d30b55 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -22,8 +22,8 @@ class FlashInferBackend(AttentionBackend): return FlashInferImpl @staticmethod - def make_metadata(*args, **kwargs) -> "FlashInferMetadata": - return FlashInferMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["AttentionMetadata"]: + return FlashInferMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index f09b24f2a..5114bfa6e 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -25,8 +25,8 @@ class IpexAttnBackend(AttentionBackend): return IpexAttnBackendImpl @staticmethod - def make_metadata(*args, **kwargs) -> "IpexAttnMetadata": - return IpexAttnMetadata(*args, 
**kwargs) + def get_metadata_cls() -> Type["IpexAttnMetadata"]: + return IpexAttnMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index b203c5ec5..62b4a144f 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -16,8 +16,8 @@ class PallasAttentionBackend(AttentionBackend): return PallasAttentionBackendImpl @staticmethod - def make_metadata(*args, **kwargs) -> "PallasMetadata": - return PallasMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["PallasMetadata"]: + return PallasMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 9294068c6..81fabdbdf 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -25,8 +25,8 @@ class ROCmFlashAttentionBackend(AttentionBackend): return ROCmFlashAttentionImpl @staticmethod - def make_metadata(*args, **kwargs) -> "ROCmFlashAttentionMetadata": - return ROCmFlashAttentionMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["AttentionMetadata"]: + return ROCmFlashAttentionMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index c01e0a0a3..63f8466da 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -31,8 +31,8 @@ class TorchSDPABackend(AttentionBackend): return TorchSDPABackendImpl @staticmethod - def make_metadata(*args, **kwargs) -> "TorchSDPAMetadata": - return TorchSDPAMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["AttentionMetadata"]: + return TorchSDPAMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 0fecd9f6e..ff449c3ff 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ 
-28,8 +28,8 @@ class XFormersBackend(AttentionBackend): return XFormersImpl @staticmethod - def make_metadata(*args, **kwargs) -> "XFormersMetadata": - return XFormersMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["AttentionMetadata"]: + return XFormersMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/executor/distributed_gpu_executor.py b/vllm/executor/distributed_gpu_executor.py index 235b5bc47..d8693e636 100644 --- a/vllm/executor/distributed_gpu_executor.py +++ b/vllm/executor/distributed_gpu_executor.py @@ -64,8 +64,8 @@ class DistributedGPUExecutor(GPUExecutor): num_cpu_blocks=num_cpu_blocks) def execute_model( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + self, execute_model_req: ExecuteModelRequest + ) -> Optional[List[SamplerOutput]]: if self.parallel_worker_tasks is None: self.parallel_worker_tasks = self._run_workers( "start_worker_execution_loop", @@ -79,7 +79,7 @@ class DistributedGPUExecutor(GPUExecutor): if self.parallel_worker_tasks is None: return - self._driver_execute_model() + self._driver_execute_model(execute_model_req=None) parallel_worker_tasks = self.parallel_worker_tasks self.parallel_worker_tasks = None # Ensure that workers exit model loop cleanly @@ -123,13 +123,13 @@ class DistributedGPUExecutor(GPUExecutor): @abstractmethod def _driver_execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: + self, execute_model_req: Optional[ExecuteModelRequest] + ) -> Optional[List[SamplerOutput]]: """Run execute_model in the driver worker. - Passing None will cause the driver to stop the model execution - loop running in each of the remote workers. + Passing None will cause the driver to stop the model execution loop + running in each of the remote workers. In this case, this method + returns None. Otherwise, this method returns the model output. 
""" raise NotImplementedError diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 7c2520b5a..d7c19622e 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -69,8 +69,8 @@ class ExecutorBase(ABC): @abstractmethod def execute_model( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + self, execute_model_req: ExecuteModelRequest + ) -> Optional[List[SamplerOutput]]: """Executes at least one model step on the given sequences.""" raise NotImplementedError diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 0a654200e..5522b5322 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -87,7 +87,7 @@ class GPUExecutor(ExecutorBase): def execute_model( self, execute_model_req: ExecuteModelRequest - ) -> List[Union[SamplerOutput, PoolerOutput]]: + ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]: output = self.driver_worker.execute_model(execute_model_req) return output diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index a5b1d27f2..6aebb4702 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -78,16 +78,14 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor): worker_monitor.close() def _driver_execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: + self, execute_model_req: Optional[ExecuteModelRequest] + ) -> Optional[List[SamplerOutput]]: """Run execute_model in the driver worker. Passing None will cause the driver to stop the model execution loop running in each of the remote workers. 
""" - return self.driver_worker.execute_model( - execute_model_req=execute_model_req) + return self.driver_worker.execute_model(execute_model_req) def _run_workers( self, diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index 1a3329749..53107dada 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -55,8 +55,7 @@ class NeuronExecutor(ExecutorBase): assert execute_model_req.num_lookahead_slots == 0, ( "lookahead not supported for Neuron backend.") - output = self.driver_worker.execute_model( - execute_model_req.seq_group_metadata_list) + output = self.driver_worker.execute_model(execute_model_req) return output def add_lora(self, lora_request: LoRARequest) -> bool: diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index fc83c5528..faa500c2d 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -190,9 +190,8 @@ class RayGPUExecutor(DistributedGPUExecutor): max_parallel_loading_workers) def _driver_execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: + self, execute_model_req: Optional[ExecuteModelRequest] + ) -> Optional[List[SamplerOutput]]: """Run execute_model in the driver worker. Passing None will cause the driver to stop the model execution diff --git a/vllm/sequence.py b/vllm/sequence.py index 287e1b9df..0925d1546 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -887,7 +887,8 @@ class HiddenStates: @dataclass class ExecuteModelRequest: - """The model execution request.""" + """The model execution request, containing CPU metadata only. The LLM + engine should create an instance of this class for each request batch.""" # The sequence group metadata list. seq_group_metadata_list: List[SequenceGroupMetadata] # Blocks to swap in. List of CPU -> GPU block number. 
diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py index 0926e13be..6c1c8da57 100644 --- a/vllm/spec_decode/mlp_speculator_worker.py +++ b/vllm/spec_decode/mlp_speculator_worker.py @@ -7,7 +7,6 @@ from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceGroupMetadata) from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase -from vllm.worker.model_runner import ModelInput class MLPSpeculatorWorker(NonLLMProposerWorkerBase, MultiStepWorker): @@ -56,7 +55,7 @@ class MLPSpeculatorWorker(NonLLMProposerWorkerBase, MultiStepWorker): seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], ) -> Tuple[torch.Tensor, List[int], List[int]]: if not seq_group_metadata_list: - return ModelInput.empty(self.device) + return torch.empty(0, device=self.device), [], [] input_tokens: List[int] = [] seq_lens: List[int] = [] diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index d539f5693..e3464c0d3 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -1,5 +1,6 @@ from collections import defaultdict -from typing import Dict, List, Optional, Tuple +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union import torch from torch import nn @@ -8,20 +9,64 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) -from vllm.distributed import broadcast_tensor_dict from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.utils import make_tensor_with_pad +from 
vllm.worker.model_runner_base import ( + ModelRunnerBase, ModelRunnerInputBase, + _add_attn_metadata_broadcastable_dict, + _add_sampling_metadata_broadcastable_dict, + _init_attn_metadata_from_tensor_dict, + _init_sampling_metadata_from_tensor_dict) + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend logger = init_logger(__name__) _PAD_SLOT_ID = -1 -class CPUModelRunner: +@dataclass(frozen=True) +class CPUModelInput(ModelRunnerInputBase): + """ + Used by the CPUModelRunner. + """ + input_tokens: Optional[torch.Tensor] = None + input_positions: Optional[torch.Tensor] = None + attn_metadata: Optional["AttentionMetadata"] = None + sampling_metadata: Optional["SamplingMetadata"] = None + multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None + + def as_broadcastable_tensor_dict( + self) -> Dict[str, Union[int, torch.Tensor]]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + "multi_modal_kwargs": self.multi_modal_kwargs, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + _add_sampling_metadata_broadcastable_dict(tensor_dict, + self.sampling_metadata) + return tensor_dict + + @classmethod + def from_broadcasted_tensor_dict( + cls: Type["CPUModelInput"], + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None + ) -> "CPUModelInput": + tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + +class CPUModelRunner(ModelRunnerBase[CPUModelInput]): def __init__( self, @@ -270,86 +315,70 @@ class CPUModelRunner: attn_metadata, ) - def prepare_input_tensors( + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> CPUModelInput: + return CPUModelInput.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + ) + + def 
prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, - Optional[Dict[str, torch.Tensor]]]: + ) -> CPUModelInput: multi_modal_kwargs = None - if self.is_driver_worker: - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = seq_group_metadata_list[0].is_prompt - # Prepare input tensors. - if is_prompt: - (input_tokens, input_positions, attn_metadata, seq_lens, - multi_modal_kwargs - ) = self._prepare_prompt(seq_group_metadata_list) - else: - (input_tokens, input_positions, - attn_metadata) = self._prepare_decode(seq_group_metadata_list) - seq_lens = [] - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - # query_lens is not needed if chunked prefill is not - # supported. Since CPU worker doesn't support chunked prefill - # just use seq_lens instead. - seq_lens, - self.device, - pin_memory=False) - # Broadcast the metadata. - metadata_dict = { - "input_tokens": input_tokens, - "input_positions": input_positions, - "selected_token_indices": - sampling_metadata.selected_token_indices, - } - metadata_dict.update(attn_metadata.asdict_zerocopy()) - broadcast_tensor_dict(metadata_dict, src=0) + # NOTE: We assume that all sequences in the group are all prompts or + # all decodes. + is_prompt = seq_group_metadata_list[0].is_prompt + # Prepare input tensors. 
+ if is_prompt: + (input_tokens, input_positions, attn_metadata, seq_lens, + multi_modal_kwargs + ) = self._prepare_prompt(seq_group_metadata_list) else: - metadata_dict = broadcast_tensor_dict(src=0) - input_tokens = metadata_dict.pop("input_tokens") - input_positions = metadata_dict.pop("input_positions") - selected_token_indices = metadata_dict.pop( - "selected_token_indices") - attn_metadata = self.attn_backend.make_metadata(**metadata_dict) - sampling_metadata = SamplingMetadata( - seq_groups=None, - seq_data=None, - seq_lens=None, - selected_token_indices=selected_token_indices, - categorized_sample_indices=None, - generators=None, - ) - - return (input_tokens, input_positions, attn_metadata, - sampling_metadata, multi_modal_kwargs) + (input_tokens, input_positions, + attn_metadata) = self._prepare_decode(seq_group_metadata_list) + seq_lens = [] + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + # query_lens is not needed if chunked prefill is not + # supported. Since CPU worker doesn't support chunked prefill + # just use seq_lens instead. 
+ seq_lens, + self.device, + pin_memory=False) + return CPUModelInput( + input_tokens=input_tokens, + input_positions=input_positions, + attn_metadata=attn_metadata, + sampling_metadata=sampling_metadata, + ) @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + model_input: CPUModelInput, kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: - (input_tokens, input_positions, attn_metadata, sampling_metadata, - multi_modal_input - ) = self.prepare_input_tensors(seq_group_metadata_list) - model_executable = self.model execute_model_kwargs = { - "input_ids": input_tokens, - "positions": input_positions, + "input_ids": model_input.input_tokens, + "positions": model_input.input_positions, "kv_caches": kv_caches, - "attn_metadata": attn_metadata, + "attn_metadata": model_input.attn_metadata, } - if self.vision_language_config and multi_modal_input is not None: - execute_model_kwargs.update(multi_modal_input) + if (self.vision_language_config + and model_input.multi_modal_kwargs is not None): + execute_model_kwargs.update(model_input.multi_modal_kwargs) hidden_states = model_executable(**execute_model_kwargs) # Compute the logits. - logits = self.model.compute_logits(hidden_states, sampling_metadata) + logits = self.model.compute_logits(hidden_states, + model_input.sampling_metadata) # Only perform sampling in the driver worker. if not self.is_driver_worker: @@ -358,6 +387,6 @@ class CPUModelRunner: # Sample the next token. 
output = self.model.sample( logits=logits, - sampling_metadata=sampling_metadata, + sampling_metadata=model_input.sampling_metadata, ) return output diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 914df0c7d..30ee262c7 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -1,5 +1,5 @@ """A CPU worker class.""" -from typing import Any, Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple import torch import torch.distributed @@ -8,15 +8,15 @@ from vllm.attention import get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) -from vllm.distributed import (broadcast_tensor_dict, - ensure_model_parallel_initialized, +from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger from vllm.model_executor import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.worker.cpu_model_runner import CPUModelRunner -from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, + LoraNotSupportedWorkerBase, WorkerInput) logger = init_logger(__name__) @@ -110,7 +110,7 @@ class CPUCacheEngine: return dtype_size * total -class CPUWorker(LoraNotSupportedWorkerBase): +class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): """A worker class that executes (a partition of) the model on a CPU socket. Each worker is associated with a single CPU socket. 
The worker is @@ -154,7 +154,7 @@ class CPUWorker(LoraNotSupportedWorkerBase): # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.model_runner = CPUModelRunner( + self.model_runner: CPUModelRunner = CPUModelRunner( model_config, parallel_config, scheduler_config, @@ -255,54 +255,37 @@ class CPUWorker(LoraNotSupportedWorkerBase): for layer_cache in self.cpu_cache: layer_cache.fill_(0) - def cache_copy( + @property + def do_metadata_broadcast(self) -> bool: + return self.parallel_config.tensor_parallel_size > 1 + + @property + def kv_cache(self) -> Optional[List[torch.Tensor]]: + return self.cpu_cache + + def execute_worker( self, - blocks_to_copy: torch.Tensor, + worker_input: WorkerInput, ) -> None: - if blocks_to_copy.numel() > 0: - self.cache_engine.copy(blocks_to_copy) + if (worker_input.blocks_to_copy is not None + and worker_input.blocks_to_copy.numel() > 0): + self.cache_engine.copy(worker_input.blocks_to_copy) @torch.inference_mode() - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> List[SamplerOutput]: - - if execute_model_req is None: - seq_group_metadata_list = None - else: - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - - if self.is_driver_worker: - assert seq_group_metadata_list is not None - num_seq_groups: int = len(seq_group_metadata_list) - assert execute_model_req is not None - blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, - device="cpu", - dtype=torch.int64).view(-1, 2) - assert len(execute_model_req.blocks_to_swap_in) == 0 - assert len(execute_model_req.blocks_to_swap_out) == 0 - data: Dict[str, Any] = { - "num_seq_groups": num_seq_groups, - "blocks_to_copy": execute_model_req.blocks_to_copy, - } - broadcast_tensor_dict(data, src=0) - else: - data = broadcast_tensor_dict(src=0) - num_seq_groups = data["num_seq_groups"] - blocks_to_copy = data["blocks_to_copy"] - - 
self.cache_copy(blocks_to_copy) - - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return [] - - output = self.model_runner.execute_model(seq_group_metadata_list, - self.cpu_cache) - - # CPU worker only supports single-step execution. - return [output] + def prepare_worker_input( + self, execute_model_req: ExecuteModelRequest) -> WorkerInput: + assert execute_model_req is not None + num_seq_groups: int = len(execute_model_req.seq_group_metadata_list) + blocks_to_copy = execute_model_req.blocks_to_copy + blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, + device="cpu", + dtype=torch.int64).view(-1, 2) + assert len(execute_model_req.blocks_to_swap_in) == 0 + assert len(execute_model_req.blocks_to_swap_out) == 0 + return WorkerInput( + num_seq_groups=num_seq_groups, + blocks_to_copy=blocks_to_copy, + ) def init_distributed_environment(self) -> None: """Initialize the distributed environment.""" diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index 465130d10..3c8dfa2c6 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -1,24 +1,32 @@ -from typing import Dict, List, Optional, Set, Tuple +import dataclasses +from typing import Any, Dict, List, Optional, Tuple, Type import torch -from vllm.attention import AttentionMetadata from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) -from vllm.distributed import broadcast_tensor_dict from vllm.logger import init_logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.pooling_params import PoolingParams from vllm.sequence import PoolerOutput, SequenceData, SequenceGroupMetadata -from vllm.worker.model_runner import ModelRunner +from vllm.worker.model_runner import GPUModelRunnerBase, 
ModelInputForGPU logger = init_logger(__name__) -class EmbeddingModelRunner(ModelRunner): +@dataclasses.dataclass(frozen=True) +class ModelInputForGPUWithPoolingMetadata(ModelInputForGPU): + """ + Used by the EmbeddingModelRunner. + """ + pooling_metadata: Optional["PoolingMetadata"] = None + + +class EmbeddingModelRunner( + GPUModelRunnerBase[ModelInputForGPUWithPoolingMetadata]): + _model_input_cls: Type[ModelInputForGPUWithPoolingMetadata] = ( + ModelInputForGPUWithPoolingMetadata) def __init__( self, @@ -47,21 +55,22 @@ class EmbeddingModelRunner(ModelRunner): @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + model_input: ModelInputForGPUWithPoolingMetadata, kv_caches: List[torch.Tensor], ) -> Optional[PoolerOutput]: - (input_tokens, input_positions, attn_metadata, pooling_metadata, - lora_requests, lora_mapping, multi_modal_input - ) = self.prepare_input_tensors(seq_group_metadata_list) - if self.lora_config: - self.set_active_loras(lora_requests, lora_mapping) + assert model_input.lora_requests is not None + assert model_input.lora_mapping is not None + self.set_active_loras(model_input.lora_requests, + model_input.lora_mapping) # Currently cuda graph is only supported by the decode phase. 
- prefill_meta = attn_metadata.prefill_metadata - decode_meta = attn_metadata.decode_metadata + assert model_input.attn_metadata is not None + prefill_meta = model_input.attn_metadata.prefill_metadata + decode_meta = model_input.attn_metadata.decode_metadata if prefill_meta is None and decode_meta.use_cuda_graph: - graph_batch_size = input_tokens.shape[0] + assert model_input.input_tokens is not None + graph_batch_size = model_input.input_tokens.shape[0] model_executable = self.graph_runners[graph_batch_size] else: model_executable = self.model @@ -70,13 +79,14 @@ class EmbeddingModelRunner(ModelRunner): kv_caches = [None] * num_layers execute_model_kwargs = { - "input_ids": input_tokens, - "positions": input_positions, + "input_ids": model_input.input_tokens, + "positions": model_input.input_positions, "kv_caches": kv_caches, - "attn_metadata": attn_metadata, + "attn_metadata": model_input.attn_metadata, } if self.vision_language_config: - execute_model_kwargs.update({"image_input": multi_modal_input}) + multi_modal_kwargs = model_input.multi_modal_kwargs or {} + execute_model_kwargs.update({"image_input": multi_modal_kwargs}) hidden_states = model_executable(**execute_model_kwargs) # Only perform pooling in the driver worker. 
@@ -84,66 +94,31 @@ class EmbeddingModelRunner(ModelRunner): return None return self.model.pooler(hidden_states=hidden_states, - pooling_metadata=pooling_metadata) + pooling_metadata=model_input.pooling_metadata) + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, + Any]) -> ModelInputForGPUWithPoolingMetadata: + return ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + ) - def prepare_input_tensors( + def prepare_model_input( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, PoolingMetadata, - Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]: - if self.is_driver_worker: - assert seq_group_metadata_list is not None - # Prepare input tensors. - ( - input_tokens, - input_positions, - attn_metadata, - seq_lens, - _, - lora_mapping, - lora_requests, - multi_modal_kwargs, - slot_mapping, - num_prefill_tokens, - num_decode_tokens, - num_prefills, - ) = self._prepare_model_input(seq_group_metadata_list) - # Prepare PoolingMetadata - pooling_metadata = self._prepare_pooling(seq_group_metadata_list, - seq_lens) - - metadata_dict = { - "input_tokens": input_tokens, - "input_positions": input_positions, - "lora_requests": lora_requests, - "lora_mapping": lora_mapping, - "multi_modal_kwargs": multi_modal_kwargs, - "num_prefill_tokens": num_prefill_tokens, - "num_decode_tokens": num_decode_tokens, - "slot_mapping": slot_mapping, - "num_prefills": num_prefills, - } - if attn_metadata: - metadata_dict.update(attn_metadata.asdict_zerocopy()) - broadcast_tensor_dict(metadata_dict, src=0) - else: - metadata_dict = broadcast_tensor_dict(src=0) - input_tokens = metadata_dict.pop("input_tokens") - input_positions = metadata_dict.pop("input_positions") - lora_mapping = metadata_dict.pop("lora_mapping") - lora_requests = metadata_dict.pop("lora_requests") - multi_modal_kwargs = 
metadata_dict.pop("multi_modal_kwargs") - if metadata_dict: - attn_metadata = self.attn_backend.make_metadata( - **metadata_dict) - else: - attn_metadata = None - pooling_metadata = PoolingMetadata(seq_groups=None, - seq_data=None, - prompt_lens=None) - - return (input_tokens, input_positions, attn_metadata, pooling_metadata, - lora_requests, lora_mapping, multi_modal_kwargs) + ) -> ModelInputForGPUWithPoolingMetadata: + assert seq_group_metadata_list is not None + model_input = self._prepare_model_input_tensors( + seq_group_metadata_list) + # Prepare PoolingMetadata. + assert model_input.seq_lens is not None + pooling_metadata = self._prepare_pooling(seq_group_metadata_list, + model_input.seq_lens) + + return dataclasses.replace(model_input, + pooling_metadata=pooling_metadata) def _prepare_pooling( self, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a321eafce..9fdb2ea5d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,8 +1,10 @@ +import dataclasses import gc import time import warnings from collections import defaultdict -from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union +from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type, + TypeVar, Union) import numpy as np import torch @@ -12,7 +14,6 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) -from vllm.distributed import broadcast_tensor_dict from vllm.distributed.parallel_state import graph_capture from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping @@ -26,6 +27,15 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip, is_pin_memory_available, make_tensor_with_pad) +from 
vllm.worker.model_runner_base import ( + ModelRunnerBase, ModelRunnerInputBase, + _add_attn_metadata_broadcastable_dict, + _add_sampling_metadata_broadcastable_dict, + _init_attn_metadata_from_tensor_dict, + _init_sampling_metadata_from_tensor_dict) + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend logger = init_logger(__name__) @@ -39,40 +49,90 @@ _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ ] _NUM_WARMUP_ITERS = 2 +TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU") -class ModelInput(NamedTuple): - input_tokens: torch.Tensor - input_positions: torch.Tensor - attn_metadata: Optional[AttentionMetadata] - seq_lens: List[int] - query_lens: List[int] - lora_mapping: Optional[LoRAMapping] - lora_requests: Set[LoRARequest] - multi_modal_kwargs: Dict[str, torch.Tensor] - slot_mapping: torch.Tensor - num_prefill_tokens: int - num_decode_tokens: int - num_prefills: int - @classmethod - def empty(cls, device): - return ModelInput( - input_tokens=torch.empty(0, device=device), - input_positions=torch.empty(0, device=device), - attn_metadata=None, - seq_lens=[], - query_lens=[], - lora_mapping=None, - lora_requests=set(), - multi_modal_kwargs={}, - slot_mapping=torch.empty(0, device=device), - num_prefill_tokens=0, - num_decode_tokens=0, - num_prefills=0, - ) +@dataclasses.dataclass(frozen=True) +class ModelInputForGPU(ModelRunnerInputBase): + """ + This base class contains metadata needed for the base model forward pass + but not metadata for possible additional steps, e.g., sampling. Model + runners that run additional steps should subclass this method to add + additional fields. 
+ """ + input_tokens: Optional[torch.Tensor] = None + input_positions: Optional[torch.Tensor] = None + seq_lens: Optional[List[int]] = None + query_lens: Optional[List[int]] = None + lora_mapping: Optional["LoRAMapping"] = None + lora_requests: Optional[Set[LoRARequest]] = None + attn_metadata: Optional["AttentionMetadata"] = None + multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + "lora_requests": self.lora_requests, + "lora_mapping": self.lora_mapping, + "multi_modal_kwargs": self.multi_modal_kwargs, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + return tensor_dict + @classmethod + def from_broadcasted_tensor_dict( + cls: Type[TModelInputForGPU], + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> TModelInputForGPU: + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + +@dataclasses.dataclass(frozen=True) +class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU): + """ + Used by the ModelRunner. + """ + sampling_metadata: Optional["SamplingMetadata"] = None + # Used for speculative decoding. We do not broadcast it because it is only + # used by the driver worker. 
+ is_prompt: Optional[bool] = None + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + "lora_requests": self.lora_requests, + "lora_mapping": self.lora_mapping, + "multi_modal_kwargs": self.multi_modal_kwargs, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + _add_sampling_metadata_broadcastable_dict(tensor_dict, + self.sampling_metadata) + return tensor_dict -class ModelRunner: + @classmethod + def from_broadcasted_tensor_dict( + cls, + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> "ModelInputForGPUWithSamplingMetadata": + tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + +class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): + """ + Helper class for shared methods between GPU model runners. + """ + _model_input_cls: Type[TModelInputForGPU] def __init__( self, @@ -241,11 +301,13 @@ class ModelRunner: block_size = self.block_size return (self.max_seq_len_to_capture + block_size - 1) // block_size - def _prepare_model_input( + def _prepare_model_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> ModelInput: - """Prepare the model input based on a given sequence group. + ) -> TModelInputForGPU: + """Helper method to prepare the model input based on a given sequence + group. Prepares metadata needed for the base model forward pass but not + metadata for possible additional steps, e.g., sampling. The API assumes seq_group_metadata_list is sorted by prefill -> decode. 
@@ -296,7 +358,7 @@ class ModelRunner: paged_kv_last_page_len: List[int] = [] if len(seq_group_metadata_list) == 0: - return ModelInput.empty(self.device) + return self._model_input_cls() if self.sliding_window is not None: sliding_window_blocks = (self.sliding_window + self.block_size - @@ -646,7 +708,7 @@ class ModelRunner: for k, v in multi_modal_kwargs_list.items() } - return ModelInput( + return self._model_input_cls( input_tokens=input_tokens_tensor, input_positions=input_positions_tensor, attn_metadata=attn_metadata, @@ -655,132 +717,8 @@ class ModelRunner: lora_mapping=lora_mapping, lora_requests=lora_requests, multi_modal_kwargs=multi_modal_kwargs, - slot_mapping=slot_mapping_tensor, - num_prefill_tokens=num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - num_prefills=num_prefills, - ) - - def prepare_input_tensors( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, - Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]: - if self.is_driver_worker: - assert seq_group_metadata_list is not None - # Prepare input tensors. 
- ( - input_tokens, - input_positions, - attn_metadata, - seq_lens, - query_lens, - lora_mapping, - lora_requests, - multi_modal_kwargs, - slot_mapping, - num_prefill_tokens, - num_decode_tokens, - num_prefills, - ) = self._prepare_model_input(seq_group_metadata_list) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, seq_lens, query_lens, self.device, - self.pin_memory) - - metadata_dict = { - "input_tokens": input_tokens, - "input_positions": input_positions, - "selected_token_indices": - sampling_metadata.selected_token_indices, - "lora_requests": lora_requests, - "lora_mapping": lora_mapping, - "multi_modal_kwargs": multi_modal_kwargs, - "num_prefill_tokens": num_prefill_tokens, - "num_decode_tokens": num_decode_tokens, - "slot_mapping": slot_mapping, - "num_prefills": num_prefills, - } - if attn_metadata: - metadata_dict.update(attn_metadata.asdict_zerocopy()) - broadcast_tensor_dict(metadata_dict, src=0) - else: - metadata_dict = broadcast_tensor_dict(src=0) - input_tokens = metadata_dict.pop("input_tokens") - input_positions = metadata_dict.pop("input_positions") - selected_token_indices = metadata_dict.pop( - "selected_token_indices") - lora_mapping = metadata_dict.pop("lora_mapping") - lora_requests = metadata_dict.pop("lora_requests") - multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs") - if metadata_dict: - attn_metadata = self.attn_backend.make_metadata( - **metadata_dict) - else: - attn_metadata = None - sampling_metadata = SamplingMetadata( - seq_groups=None, - selected_token_indices=selected_token_indices, - categorized_sample_indices=None, - num_prompts=0, - ) - - return (input_tokens, input_positions, attn_metadata, - sampling_metadata, lora_requests, lora_mapping, - multi_modal_kwargs) - - @torch.inference_mode() - def execute_model( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - kv_caches: List[torch.Tensor], - ) -> Optional[SamplerOutput]: - (input_tokens, input_positions, 
attn_metadata, sampling_metadata, - lora_requests, lora_mapping, multi_modal_kwargs - ) = self.prepare_input_tensors(seq_group_metadata_list) - - if self.lora_config: - self.set_active_loras(lora_requests, lora_mapping) - - # Currently cuda graph is only supported by the decode phase. - prefill_meta = attn_metadata.prefill_metadata - decode_meta = attn_metadata.decode_metadata - if prefill_meta is None and decode_meta.use_cuda_graph: - graph_batch_size = input_tokens.shape[0] - model_executable = self.graph_runners[graph_batch_size] - else: - model_executable = self.model - - hidden_states = model_executable( - input_ids=input_tokens, - positions=input_positions, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - **multi_modal_kwargs, - ) - - # Compute the logits. - logits = self.model.compute_logits(hidden_states, sampling_metadata) - - # Only perform sampling in the driver worker. - if not self.is_driver_worker: - return None - - # Sample the next token. - output: SamplerOutput = self.model.sample( - logits=logits, - sampling_metadata=sampling_metadata, ) - if self.return_hidden_states: - # we only need to pass hidden states of most recent token - assert seq_group_metadata_list is not None - if seq_group_metadata_list[0].is_prompt: - hidden_states = hidden_states.index_select( - 0, sampling_metadata.selected_token_indices) - output.hidden_states = hidden_states - - return output - @torch.inference_mode() def profile_run(self) -> None: # Enable top-k sampling to reflect the accurate memory usage. @@ -853,7 +791,8 @@ class ModelRunner: # Run the model with the dummy inputs. 
num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers - self.execute_model(seqs, kv_caches) + model_input = self.prepare_model_input(seqs) + self.execute_model(model_input, kv_caches) torch.cuda.synchronize() return @@ -986,6 +925,110 @@ class ModelRunner: return self.model_config.get_vocab_size() +class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): + """ + GPU model runner with sampling step. + """ + _model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = ( + ModelInputForGPUWithSamplingMetadata) + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> ModelInputForGPUWithSamplingMetadata: + return ( + ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + )) + + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> ModelInputForGPUWithSamplingMetadata: + """Prepare the model input based on a given sequence group, including + metadata for the sampling step. + + The API assumes seq_group_metadata_list is sorted by prefill -> decode. + + The result tensors and data structure also batches input in prefill + -> decode order. For example, + + - input_tokens[:num_prefill_tokens] contains prefill tokens. + - input_tokens[num_prefill_tokens:] contains decode tokens. + + If cuda graph is required, this API automatically pads inputs. 
+ """ + model_input = self._prepare_model_input_tensors( + seq_group_metadata_list) + sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, + model_input.seq_lens, + model_input.query_lens, + self.device, + self.pin_memory) + is_prompt = (seq_group_metadata_list[0].is_prompt + if seq_group_metadata_list else None) + return dataclasses.replace(model_input, + sampling_metadata=sampling_metadata, + is_prompt=is_prompt) + + @torch.inference_mode() + def execute_model( + self, + model_input: ModelInputForGPUWithSamplingMetadata, + kv_caches: List[torch.Tensor], + ) -> SamplerOutput: + if self.lora_config: + assert model_input.lora_requests is not None + assert model_input.lora_mapping is not None + self.set_active_loras(model_input.lora_requests, + model_input.lora_mapping) + + # Currently cuda graph is only supported by the decode phase. + assert model_input.attn_metadata is not None + prefill_meta = model_input.attn_metadata.prefill_metadata + decode_meta = model_input.attn_metadata.decode_metadata + if prefill_meta is None and decode_meta.use_cuda_graph: + assert model_input.input_tokens is not None + graph_batch_size = model_input.input_tokens.shape[0] + model_executable = self.graph_runners[graph_batch_size] + else: + model_executable = self.model + + multi_modal_kwargs = model_input.multi_modal_kwargs or {} + hidden_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + **multi_modal_kwargs, + ) + + # Compute the logits. + logits = self.model.compute_logits(hidden_states, + model_input.sampling_metadata) + + # Only perform sampling in the driver worker. + if not self.is_driver_worker: + return None + + # Sample the next token. 
+ output: SamplerOutput = self.model.sample( + logits=logits, + sampling_metadata=model_input.sampling_metadata, + ) + + if self.return_hidden_states: + # we only need to pass hidden states of most recent token + if model_input.is_prompt: + assert model_input.sampling_metadata is not None + hidden_states = hidden_states.index_select( + 0, model_input.sampling_metadata.selected_token_indices) + output.hidden_states = hidden_states + + return output + + class CUDAGraphRunner: def __init__(self, model: nn.Module): diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py new file mode 100644 index 000000000..9b1706035 --- /dev/null +++ b/vllm/worker/model_runner_base.py @@ -0,0 +1,157 @@ +import dataclasses +from abc import ABC, abstractmethod +from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type, + TypeVar) + +import torch + +from vllm.sequence import SamplerOutput, SequenceGroupMetadata + +if TYPE_CHECKING: + from vllm.attention import AttentionMetadata + from vllm.attention.backends.abstract import AttentionBackend + from vllm.model_executor import SamplingMetadata + +T = TypeVar('T', bound="ModelRunnerInputBase") + + +def _add_attn_metadata_broadcastable_dict( + tensor_dict: Dict[str, Any], + attn_metadata: Optional["AttentionMetadata"]) -> None: + """ + Helper method to update tensor_dict with broadcastable + AttentionMetadata fields. + """ + if attn_metadata is not None: + tensor_dict.update(attn_metadata.asdict_zerocopy()) + + +def _init_attn_metadata_from_tensor_dict( + attn_backend: "AttentionBackend", + tensor_dict: Dict[str, Any], +) -> Dict[str, Any]: + """ + Helper method to initialize AttentionMetadata based on an + AttentionBackend and broadcastable AttentionMetadata fields. + """ + # Extract the fields used to create AttentionMetadata. 
+ valid_attn_kwargs = {} + for field in dataclasses.fields(attn_backend.get_metadata_cls()): + val = tensor_dict.pop(field.name, None) + if val is not None: + valid_attn_kwargs[field.name] = val + + attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs) + tensor_dict["attn_metadata"] = attn_metadata + return tensor_dict + + +def _init_sampling_metadata_from_tensor_dict( # type: ignore + tensor_dict: Dict[str, Any]) -> Dict[str, Any]: + """ + Helper method to initialize SamplingMetadata based on broadcastable + SamplingMetadata fields. + """ + from vllm.model_executor import SamplingMetadata + + selected_token_indices = tensor_dict.pop("selected_token_indices", None) + # An empty SamplingMetadata to signal that the worker should skip + # sampling. + if selected_token_indices is not None: + tensor_dict["sampling_metadata"] = SamplingMetadata( + seq_groups=None, + selected_token_indices=selected_token_indices, + categorized_sample_indices=None, + num_prompts=0, + ) + return tensor_dict + + +def _add_sampling_metadata_broadcastable_dict( + tensor_dict: Dict[str, Any], + sampling_metadata: Optional["SamplingMetadata"]) -> None: + """ + Helper method to update tensor_dict with broadcastable + SamplingMetadata fields. + """ + if sampling_metadata is not None: + tensor_dict["selected_token_indices"] = ( + sampling_metadata.selected_token_indices) + + +@dataclasses.dataclass(frozen=True) +class ModelRunnerInputBase(ABC): + """Local inputs to each worker's model runner. May contain + device-specific data. Different worker backends may have different methods + of converting from the global ExecuteModelRequest produced by the LLM + engine to the worker-local ModelRunnerInputBase objects. + + Model runners that support multi-GPU execution should define a + ModelRunnerInputBase subclass, add their required fields, and specify how to + serialize/deserialize a ModelInput for broadcast between workers. 
+ """ + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + """ + Extract broadcastable fields. Override for fields that require some + custom deserialization. + """ + raise NotImplementedError + + @classmethod + @abstractmethod + def from_broadcasted_tensor_dict( + cls: Type[T], + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> T: + """ + Pop fields from the given tensor_dict and populate a new instance of + ModelRunnerInputBase. + """ + raise NotImplementedError + + +class ModelRunnerBase(ABC, Generic[T]): + """ + Model runner interface that abstracts a particular hardware and/or type of + model. Model execution may communicate data with model runners in other + processes, but it should not include control plane metadata communication. + + Each ModelRunnerBase subclass should define a corresponding + ModelRunnerInputBase subclass. + """ + + @abstractmethod + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> T: + """ + Make an instance of a ModelRunnerInputBase from the broadcasted tensor + dict. + """ + raise NotImplementedError + + @abstractmethod + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> T: + """ + Prepare the inputs to ModelRunnerBase.execute_model from an execution + request. This method may move data to the worker's local device. It is + not allowed to communicate with other workers or devices. + """ + raise NotImplementedError + + @torch.inference_mode() + def execute_model( + self, + model_input: T, + kv_caches: Optional[List[torch.Tensor]], + ) -> Optional[SamplerOutput]: + """ + Execute the model on the given input. 
+ """ + raise NotImplementedError diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index a336be04e..fec2c97e7 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -1,4 +1,5 @@ -from typing import List, Optional, Tuple +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch from torch import nn @@ -10,11 +11,39 @@ from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader.neuron import get_neuron_model from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.utils import is_pin_memory_available, make_tensor_with_pad +from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend logger = init_logger(__name__) -class NeuronModelRunner: +@dataclass(frozen=True) +class ModelInputForNeuron(ModelRunnerInputBase): + """ + Used by the NeuronModelRunner. 
+ """ + input_tokens: Optional[torch.Tensor] = None + input_positions: Optional[torch.Tensor] = None + input_block_ids: Optional[torch.Tensor] = None + sampling_metadata: Optional["SamplingMetadata"] = None + + def as_broadcastable_tensor_dict( + self) -> Dict[str, Union[int, torch.Tensor]]: + raise NotImplementedError("ModelInputForNeuron cannot be broadcast.") + + @classmethod + def from_broadcasted_tensor_dict( + cls, + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> "ModelInputForNeuron": + assert attn_backend is None + return cls.from_broadcasted_tensor_dict(tensor_dict) + + +class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): def __init__( self, @@ -139,10 +168,14 @@ class NeuronModelRunner: return input_tokens, input_positions, input_block_ids - def prepare_input_tensors( + def make_model_input_from_broadcasted_tensor_dict( + self, tensor_dict: Dict[str, Any]) -> ModelInputForNeuron: + return ModelInputForNeuron.from_broadcasted_tensor_dict(tensor_dict) + + def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, SamplingMetadata]: + ) -> ModelInputForNeuron: # NOTE: We assume that all sequences in the group are all prompts or # all decodes. 
is_prompt = seq_group_metadata_list[0].is_prompt @@ -164,30 +197,31 @@ class NeuronModelRunner: self.device, self.pin_memory) - return (input_tokens, input_positions, input_block_ids, - sampling_metadata) + return ModelInputForNeuron(input_tokens=input_tokens, + input_positions=input_positions, + input_block_ids=input_block_ids, + sampling_metadata=sampling_metadata) @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + model_input: ModelInputForNeuron, + kv_caches: Optional[List[torch.Tensor]] = None, ) -> Optional[SamplerOutput]: - (input_tokens, input_positions, input_block_ids, sampling_metadata - ) = self.prepare_input_tensors(seq_group_metadata_list) - hidden_states = self.model( - input_ids=input_tokens, - positions=input_positions, - input_block_ids=input_block_ids, + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + input_block_ids=model_input.input_block_ids, ) # Compute the logits. - logits = self.model.compute_logits(hidden_states, sampling_metadata) + logits = self.model.compute_logits(hidden_states, + model_input.sampling_metadata) # Sample the next token. 
output = self.model.sample( logits=logits, - sampling_metadata=sampling_metadata, + sampling_metadata=model_input.sampling_metadata, ) return output diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index d0e6aaed1..307c107dd 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -1,5 +1,5 @@ """A Neuron worker class.""" -from typing import List, Tuple +from typing import List, Optional, Tuple import torch import torch.distributed @@ -7,12 +7,13 @@ import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig) from vllm.model_executor import set_random_seed -from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.sequence import ExecuteModelRequest from vllm.worker.neuron_model_runner import NeuronModelRunner -from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, + LoraNotSupportedWorkerBase, WorkerInput) -class NeuronWorker(LoraNotSupportedWorkerBase): +class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): """A worker class that executes the model on a group of neuron cores. """ @@ -34,8 +35,9 @@ class NeuronWorker(LoraNotSupportedWorkerBase): from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.model_runner = NeuronModelRunner(model_config, parallel_config, - scheduler_config, device_config) + self.model_runner: NeuronModelRunner = NeuronModelRunner( + model_config, parallel_config, scheduler_config, device_config) + self.is_driver_worker = True def init_device(self) -> None: # Set random seed. 
@@ -73,22 +75,19 @@ class NeuronWorker(LoraNotSupportedWorkerBase): self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - @torch.inference_mode() - def execute_model( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> List[SamplerOutput]: - num_seq_groups = len(seq_group_metadata_list) + @property + def do_metadata_broadcast(self) -> bool: + return False - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return [] + @property + def kv_cache(self) -> Optional[List[torch.Tensor]]: + return None - output = self.model_runner.execute_model(seq_group_metadata_list) - - # Neuron worker only supports single-step output. Wrap the output in a - # list to conform to interface. - return [output] + @torch.inference_mode() + def prepare_worker_input( + self, execute_model_req: ExecuteModelRequest) -> WorkerInput: + return WorkerInput(num_seq_groups=len( + execute_model_req.seq_group_metadata_list), ) def get_cache_block_size_bytes(self) -> int: """Determine the size in bytes of a cache block. 
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index c60764ef1..e1944a4f1 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -1,7 +1,7 @@ """A GPU worker class.""" import gc import os -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import List, Optional, Set, Tuple, Type import torch import torch.distributed @@ -9,21 +9,20 @@ import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) -from vllm.distributed import (broadcast_tensor_dict, - ensure_model_parallel_initialized, +from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.sequence import ExecuteModelRequest, PoolerOutput, SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.worker.cache_engine import CacheEngine from vllm.worker.embedding_model_runner import EmbeddingModelRunner -from vllm.worker.model_runner import ModelRunner -from vllm.worker.worker_base import WorkerBase +from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner +from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput -class Worker(WorkerBase): +class Worker(LocalOrDistributedWorkerBase): """A worker class that executes (a partition of) the model on a GPU. Each worker is associated with a single GPU. 
The worker is responsible for @@ -78,9 +77,10 @@ class Worker(WorkerBase): or (speculative_config.draft_model_config.hf_config.model_type != "mlp_speculator") else {"return_hidden_states": True} - ModelRunnerClass = (EmbeddingModelRunner if - self.model_config.embedding_mode else ModelRunner) - self.model_runner = ModelRunnerClass( + ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner + if self.model_config.embedding_mode: + ModelRunnerClass = EmbeddingModelRunner + self.model_runner: GPUModelRunnerBase = ModelRunnerClass( model_config, parallel_config, scheduler_config, @@ -225,40 +225,18 @@ class Worker(WorkerBase): # the model initialization and profiling. set_random_seed(self.model_config.seed) - def cache_swap( - self, - blocks_to_swap_in: torch.Tensor, - blocks_to_swap_out: torch.Tensor, - blocks_to_copy: torch.Tensor, - ) -> None: - # Issue cache operations. - if blocks_to_swap_in.numel() > 0: - self.cache_engine.swap_in(blocks_to_swap_in) - if blocks_to_swap_out.numel() > 0: - self.cache_engine.swap_out(blocks_to_swap_out) - if blocks_to_copy.numel() > 0: - self.cache_engine.copy(blocks_to_copy) + @property + def do_metadata_broadcast(self) -> bool: + return self.parallel_config.tensor_parallel_size > 1 + + @property + def kv_cache(self) -> Optional[List[torch.Tensor]]: + return self.gpu_cache @torch.inference_mode() - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[Union[SamplerOutput, PoolerOutput]]: - if not self.is_driver_worker: - self._execute_model_non_driver() - return [] - - if execute_model_req is None: - # This signals that there's no more requests to process for now. - # All workers are running infinite loop with broadcast_tensor_dict, - # and it stops the loop when the driver broadcasts an empty input. - # Send an empty input to notify all other workers to stop their - # execution loop. 
- broadcast_tensor_dict({}, src=0) - return [] - - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - num_seq_groups = len(seq_group_metadata_list) + def prepare_worker_input( + self, execute_model_req: ExecuteModelRequest) -> WorkerInput: + num_seq_groups = len(execute_model_req.seq_group_metadata_list) # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. # they contain parameters to launch cudamemcpyasync. blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, @@ -273,59 +251,26 @@ class Worker(WorkerBase): blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, device=self.device, dtype=torch.int64).view(-1, 2) - data: Dict[str, Any] = { - "num_seq_groups": num_seq_groups, - "blocks_to_swap_in": blocks_to_swap_in, - "blocks_to_swap_out": blocks_to_swap_out, - "blocks_to_copy": blocks_to_copy, - } - broadcast_tensor_dict(data, src=0) - - self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) - - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return [] - output = self.model_runner.execute_model(seq_group_metadata_list, - self.gpu_cache) - - # Worker only supports single-step execution. Wrap the output in a list - # to conform to interface. - return [output] + return WorkerInput( + num_seq_groups=num_seq_groups, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) @torch.inference_mode() - def start_worker_execution_loop(self) -> None: - """Execute model loop in parallel worker. - - You can stop the loop by executing a driver worker with an empty output. - See `stop_remote_worker_execution_loop` for more details. - """ - while self._execute_model_non_driver(): - pass - - def _execute_model_non_driver(self) -> bool: - """Execute model in parallel worker. - - Returns True iff there are remaining sequences to process. 
- """ - assert not self.is_driver_worker - data = broadcast_tensor_dict(src=0) - if not data: - return False - - num_seq_groups = data.get("num_seq_groups", 0) - blocks_to_swap_in = data.get("blocks_to_swap_in") - blocks_to_swap_out = data.get("blocks_to_swap_out") - blocks_to_copy = data.get("blocks_to_copy") - self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) - - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return False - - self.model_runner.execute_model(None, self.gpu_cache) - return True + def execute_worker(self, worker_input: WorkerInput) -> None: + # Issue cache operations. + if (worker_input.blocks_to_swap_in is not None + and worker_input.blocks_to_swap_in.numel() > 0): + self.cache_engine.swap_in(worker_input.blocks_to_swap_in) + if (worker_input.blocks_to_swap_out is not None + and worker_input.blocks_to_swap_out.numel() > 0): + self.cache_engine.swap_out(worker_input.blocks_to_swap_out) + if (worker_input.blocks_to_copy is not None + and worker_input.blocks_to_copy.numel() > 0): + self.cache_engine.copy(worker_input.blocks_to_copy) def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_runner.add_lora(lora_request) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 99482aa93..1df60eb1f 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -1,20 +1,26 @@ +import dataclasses import importlib import os from abc import ABC, abstractmethod -from typing import Dict, List, Optional, Set, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union +import torch + +from vllm.distributed import broadcast_tensor_dict from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (enable_trace_function_call_for_thread, is_hip, update_environment_variables) +from vllm.worker.model_runner_base import ModelRunnerBase, 
ModelRunnerInputBase logger = init_logger(__name__) class WorkerBase(ABC): """Worker interface that allows vLLM to cleanly separate implementations for - different hardware. + different hardware. Also abstracts control plane communication, e.g., to + communicate request metadata to other workers. """ @abstractmethod @@ -46,13 +52,23 @@ class WorkerBase(ABC): """ raise NotImplementedError + @torch.inference_mode() + def start_worker_execution_loop(self) -> None: + """Execute model loop in parallel worker. + + You can stop the loop by executing a driver worker with an empty output. + See `stop_remote_worker_execution_loop` for more details. + """ + while True: + output = self.execute_model(execute_model_req=None) + if output is None: + return None + @abstractmethod def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - """Executes at least one model step on the given sequences, unless no - sequences are provided.""" + ) -> Optional[List[SamplerOutput]]: raise NotImplementedError @abstractmethod @@ -98,6 +114,150 @@ class LoraNotSupportedWorkerBase(WorkerBase): raise ValueError(f"{type(self)} does not support LoRA") +@dataclasses.dataclass(frozen=True) +class WorkerInput: + """Local inputs to each worker. May contain device-specific data. These + fields should be broadcastable to other workers. + """ + + num_seq_groups: Optional[int] = None + blocks_to_swap_in: Optional[torch.Tensor] = None + blocks_to_swap_out: Optional[torch.Tensor] = None + blocks_to_copy: Optional[torch.Tensor] = None + + @classmethod + def from_broadcasted_tensor_dict( + cls: Type["WorkerInput"], + tensor_dict: Dict[str, Any], + ) -> "WorkerInput": + """ + Pop fields from the given tensor_dict and populate a new instance of + WorkerInput. 
+ """ + return cls( + num_seq_groups=tensor_dict.pop("num_seq_groups"), + blocks_to_swap_in=tensor_dict.pop("blocks_to_swap_in"), + blocks_to_swap_out=tensor_dict.pop("blocks_to_swap_out"), + blocks_to_copy=tensor_dict.pop("blocks_to_copy"), + ) + + def as_broadcastable_tensor_dict( + self) -> Dict[str, Union[int, torch.Tensor]]: + """ + Extract broadcastable fields. + """ + tensor_dict = { + "num_seq_groups": self.num_seq_groups, + "blocks_to_swap_in": self.blocks_to_swap_in, + "blocks_to_swap_out": self.blocks_to_swap_out, + "blocks_to_copy": self.blocks_to_copy, + } + + return tensor_dict + + +class LocalOrDistributedWorkerBase(WorkerBase): + """ + Partial implementation of WorkerBase that has a default `execute_model` + definition to perform metadata transfer between workers when in distributed + mode. Subclasses of this interface should use model runners that inherit + from ModelRunnerBase, and should only need to implement worker-local logic. + If custom control plane logic is needed to transfer metadata, or if the + model runner cannot inherit from ModelRunnerBase, use WorkerBase instead. + """ + is_driver_worker: bool + model_runner: ModelRunnerBase + + @property + @abstractmethod + def do_metadata_broadcast(self) -> bool: + """ + Used by the default `execute_model` to check whether broadcast is + needed to transfer request inputs from the driver worker to other + workers in the TP group. If WorkerBase subclass only supports + single-worker execution, then this method should return False. + """ + raise NotImplementedError + + @property + @abstractmethod + def kv_cache(self) -> Optional[List[torch.Tensor]]: + """ + Get the kv cache to pass to the worker's model runner. Used by the + default `execute_model`. If the worker's model runner does not follow + the ModelRunnerBase interface, then inherit from WorkerBase instead. 
+ """ + raise NotImplementedError + + @abstractmethod + def prepare_worker_input( + self, execute_model_req: ExecuteModelRequest) -> WorkerInput: + """ + Prepare the inputs to WorkerBase.execute_worker from an execution + request. This method may move data to the worker's local device. It is + not allowed to communicate with other workers or devices. + """ + raise NotImplementedError + + @abstractmethod + def execute_worker(self, worker_input: WorkerInput) -> None: + """ + Process an execution request. + """ + raise NotImplementedError + + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> Optional[List[SamplerOutput]]: + """Executes at least one model step on the given sequences, unless no + sequences are provided.""" + if self.is_driver_worker: + if execute_model_req is None: + if self.do_metadata_broadcast: + # This signals that there's no more requests to process for + # now. All workers are running infinite loop with + # broadcast_tensor_dict, and it stops the loop when the + # driver broadcasts an empty input. Send an empty input to + # notify all other workers to stop their execution loop. + broadcast_tensor_dict({}, src=0) + return None + + worker_input: WorkerInput = self.prepare_worker_input( + execute_model_req=execute_model_req) + model_input: ModelRunnerInputBase = ( + self.model_runner.prepare_model_input( + execute_model_req.seq_group_metadata_list)) + + if self.do_metadata_broadcast: + broadcast_data = worker_input.as_broadcastable_tensor_dict() + broadcast_data.update( + model_input.as_broadcastable_tensor_dict()) + broadcast_tensor_dict(broadcast_data, src=0) + else: + assert self.do_metadata_broadcast + broadcast_data = broadcast_tensor_dict(src=0) + if not broadcast_data: + return None + + worker_input = WorkerInput.from_broadcasted_tensor_dict( + broadcast_data) + model_input = ( + self.model_runner. 
+ make_model_input_from_broadcasted_tensor_dict(broadcast_data)) + + self.execute_worker(worker_input) + + # If there is no input, we don't need to execute the model. + if worker_input.num_seq_groups == 0: + return [] + + output = self.model_runner.execute_model(model_input, self.kv_cache) + # Worker only supports single-step execution. Wrap the output in a + # list to conform to interface. + return [output] + + class WorkerWrapperBase: """ The whole point of this class is to lazily initialize the worker. diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index f30de703e..d9124a788 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -1,4 +1,5 @@ -from typing import List, Optional, Tuple +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union import torch import torch.nn as nn @@ -14,6 +15,15 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata +from vllm.worker.model_runner_base import ( + ModelRunnerBase, ModelRunnerInputBase, + _add_attn_metadata_broadcastable_dict, + _add_sampling_metadata_broadcastable_dict, + _init_attn_metadata_from_tensor_dict, + _init_sampling_metadata_from_tensor_dict) + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend logger = init_logger(__name__) @@ -24,7 +34,42 @@ _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ ] -class XPUModelRunner: +@dataclass(frozen=True) +class ModelInputForXPU(ModelRunnerInputBase): + """ + Used by the NeuronModelRunner. 
+ """ + input_tokens: Optional[torch.Tensor] = None + input_positions: Optional[torch.Tensor] = None + attn_metadata: Optional["AttentionMetadata"] = None + sampling_metadata: Optional["SamplingMetadata"] = None + multi_modal_input: Optional[Dict[str, torch.Tensor]] = None + + def as_broadcastable_tensor_dict( + self) -> Dict[str, Union[int, torch.Tensor]]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + _add_sampling_metadata_broadcastable_dict(tensor_dict, + self.sampling_metadata) + return tensor_dict + + @classmethod + def from_broadcasted_tensor_dict( + cls: Type["ModelInputForXPU"], + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> "ModelInputForXPU": + tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + +class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]): def __init__( self, @@ -130,15 +175,22 @@ class XPUModelRunner: # Run the model with the dummy inputs. 
num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers - self.execute_model(seqs, kv_caches) + model_input = self.prepare_model_input(seqs) + self.execute_model(model_input, kv_caches) torch.xpu.synchronize() return - def prepare_input_tensors( + def make_model_input_from_broadcasted_tensor_dict( + self, tensor_dict: Dict[str, Any]) -> ModelInputForXPU: + return (ModelInputForXPU.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + )) + + def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, - Optional[torch.Tensor]]: + ) -> ModelInputForXPU: multi_modal_input = None if self.is_driver_worker: # NOTE: We assume that all sequences in the group are all prompts or @@ -185,8 +237,11 @@ class XPUModelRunner: num_prompts=0, ) - return (input_tokens, input_positions, attn_metadata, - sampling_metadata, multi_modal_input) + return ModelInputForXPU(input_tokens=input_tokens, + input_positions=input_positions, + attn_metadata=attn_metadata, + sampling_metadata=sampling_metadata, + multi_modal_input=multi_modal_input) def _prepare_decode( self, @@ -277,27 +332,25 @@ class XPUModelRunner: @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + model_input: ModelInputForXPU, kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: - (input_tokens, input_positions, attn_metadata, sampling_metadata, - multi_modal_input - ) = self.prepare_input_tensors(seq_group_metadata_list) - model_executable = self.model execute_model_kwargs = { - "input_ids": input_tokens, - "positions": input_positions, + "input_ids": model_input.input_tokens, + "positions": model_input.input_positions, "kv_caches": kv_caches, - "attn_metadata": attn_metadata, + "attn_metadata": model_input.attn_metadata, } if self.vision_language_config: - 
execute_model_kwargs.update({"image_input": multi_modal_input}) + execute_model_kwargs.update( + {"image_input": model_input.multi_modal_input}) hidden_states = model_executable(**execute_model_kwargs) # Compute the logits. - logits = self.model.compute_logits(hidden_states, sampling_metadata) + logits = self.model.compute_logits(hidden_states, + model_input.sampling_metadata) # Only perform sampling in the driver worker. if not self.is_driver_worker: @@ -306,7 +359,7 @@ class XPUModelRunner: # Sample the next token. output = self.model.sample( logits=logits, - sampling_metadata=sampling_metadata, + sampling_metadata=model_input.sampling_metadata, ) return output -- GitLab From 3aa7b6cf66890c042ebecf9e8094f4f5e3dbf96e Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Tue, 25 Jun 2024 20:34:25 -0700 Subject: [PATCH 153/376] [Misc][Doc] Add Example of using OpenAI Server with VLM (#5832) --- docs/source/models/vlm.rst | 2 + examples/openai_vision_api_client.py | 90 ++++++++++++++++++++++++++++ vllm/multimodal/utils.py | 12 +++- 3 files changed, 101 insertions(+), 3 deletions(-) create mode 100644 examples/openai_vision_api_client.py diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index de55a1a09..1837dd2aa 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -130,6 +130,8 @@ To consume the server, you can use the OpenAI client like in the example below: ) print("Chat response:", chat_response) +A full code example can be found in `examples/openai_vision_api_client.py `_. + .. note:: By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable: diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py new file mode 100644 index 000000000..26f2aa651 --- /dev/null +++ b/examples/openai_vision_api_client.py @@ -0,0 +1,90 @@ +"""An example showing how to use vLLM to serve VLMs. 
+ +Launch the vLLM server with the following command: +python -m vllm.entrypoints.openai.api_server \ + --model llava-hf/llava-1.5-7b-hf \ + --image-input-type pixel_values \ + --image-token-id 32000 \ + --image-input-shape 1,3,336,336 \ + --image-feature-size 576 \ + --chat-template template_llava.jinja +""" +import base64 + +import requests +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + +# Use image url in the payload +chat_completion_from_url = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What’s in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + }, + ], + }], + model=model, +) + +result = chat_completion_from_url.choices[0].message.content +print(f"Chat completion output:{result}") + + +# Use base64 encoded image in the payload +def encode_image_base64_from_url(image_url: str) -> str: + """Encode an image retrieved from a remote url to base64 format.""" + + with requests.get(image_url) as response: + response.raise_for_status() + result = base64.b64encode(response.content).decode('utf-8') + + return result + + +image_base64 = encode_image_base64_from_url(image_url=image_url) +chat_completion_from_base64 = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What’s in this image?" 
+ }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + }, + ], + }], + model=model, +) + +result = chat_completion_from_base64.choices[0].message.content +print(f"Chat completion output:{result}") diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 509f791d2..0cf2c057f 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,6 +1,7 @@ import base64 from io import BytesIO from typing import Optional, Union +from urllib.parse import urlparse import aiohttp from PIL import Image @@ -28,6 +29,10 @@ class ImageFetchAiohttp: """Load PIL image from a url or base64 encoded openai GPT4V format""" if image_url.startswith('http'): + parsed_url = urlparse(image_url) + if parsed_url.scheme not in ["http", "https"]: + raise ValueError("Invalid 'image_url': A valid 'image_url' " + "must have scheme 'http' or 'https'.") # Avoid circular import from vllm import __version__ as VLLM_VERSION @@ -44,8 +49,9 @@ class ImageFetchAiohttp: image = load_image_from_base64(image_url.split(',', 1)[1]) else: - raise ValueError("Invalid image url: A valid image url must start " - "with either 'data:image' or 'http'.") + raise ValueError( + "Invalid 'image_url': A valid 'image_url' must start " + "with either 'data:image' or 'http'.") return image @@ -56,7 +62,7 @@ async def async_get_and_parse_image(image_url: str) -> ImagePixelData: def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str: - """encode image to base64 format.""" + """Encode a pillow image to base64 format.""" buffered = BytesIO() if format == 'JPEG': -- GitLab From 515080ad2fd93cc8e363ff43b90a9df18cfd71ff Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 25 Jun 2024 21:56:02 -0700 Subject: [PATCH 154/376] [bugfix][distributed] fix shm broadcast when the queue size is full (#5801) --- tests/distributed/test_shm_broadcast.py | 49 +++++++++---- .../device_communicators/shm_broadcast.py | 73 +++++++++++-------- 2 files changed, 
76 insertions(+), 46 deletions(-) diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py index d92900ffc..2c2466f81 100644 --- a/tests/distributed/test_shm_broadcast.py +++ b/tests/distributed/test_shm_broadcast.py @@ -1,7 +1,9 @@ import multiprocessing import random import time +from typing import List +import numpy as np import torch.distributed as dist from vllm.distributed.device_communicators.shm_broadcast import ( @@ -9,6 +11,14 @@ from vllm.distributed.device_communicators.shm_broadcast import ( from vllm.utils import update_environment_variables +def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]: + np.random.seed(seed) + sizes = np.random.randint(1, 10_000, n) + # on average, each array will have 5k elements + # with int64, each array will have 40kb + return [np.random.randint(1, 100, i) for i in sizes] + + def distributed_run(fn, world_size): number_of_processes = world_size processes = [] @@ -47,24 +57,31 @@ def worker_fn_wrapper(fn): def worker_fn(): writer_rank = 2 broadcaster = ShmRingBufferIO.create_from_process_group( - dist.group.WORLD, 1024, 2, writer_rank) + dist.group.WORLD, 1024 * 1024, 2, writer_rank) + if dist.get_rank() == writer_rank: + seed = random.randint(0, 1000) + dist.broadcast_object_list([seed], writer_rank) + else: + recv = [None] + dist.broadcast_object_list(recv, writer_rank) + seed = recv[0] # type: ignore + dist.barrier() + # in case we find a race condition + # print the seed so that we can reproduce the error + print(f"Rank {dist.get_rank()} got seed {seed}") + # test broadcasting with about 400MB of data + N = 10_000 if dist.get_rank() == writer_rank: - time.sleep(random.random()) - broadcaster.broadcast_object(0) - time.sleep(random.random()) - broadcaster.broadcast_object({}) - time.sleep(random.random()) - broadcaster.broadcast_object([]) + arrs = get_arrays(N, seed) + for x in arrs: + broadcaster.broadcast_object(x) + time.sleep(random.random() / 1000) else: - 
time.sleep(random.random()) - a = broadcaster.broadcast_object(None) - time.sleep(random.random()) - b = broadcaster.broadcast_object(None) - time.sleep(random.random()) - c = broadcaster.broadcast_object(None) - assert a == 0 - assert b == {} - assert c == [] + arrs = get_arrays(N, seed) + for x in arrs: + y = broadcaster.broadcast_object(None) + assert np.array_equal(x, y) + time.sleep(random.random() / 1000) dist.barrier() diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index c44bd2f11..550271f88 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -14,6 +14,12 @@ from vllm.logger import init_logger VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL +# time to wait if the queue is full or empty +# if we sleep for too short, it will consume too much CPU +# if we sleep for too long, it will slow down the writer/reader +# 0.1 us is a good balance +RINGBUFFER_SLEEP_INTERVAL = 1e-7 + logger = init_logger(__name__) @@ -145,8 +151,7 @@ class ShmRingBufferIO: @contextmanager def acquire_write(self): assert self._is_writer, "Only writers can acquire write" - start_index = self.current_idx - start_time = time.time() + start_time = time.monotonic() n_warning = 1 while True: with self.buffer.get_metadata(self.current_idx) as metadata_buffer: @@ -154,19 +159,21 @@ class ShmRingBufferIO: written_flag = metadata_buffer[0] if written_flag and read_count != self.buffer.n_reader: # this block is written and not read by all readers - # try to write to the next block - self.current_idx = (self.current_idx + - 1) % self.buffer.max_chunks - if self.current_idx == start_index: - # no empty block found - if time.time( - ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa - logger.warning( - "No available block found in %s second. 
", - VLLM_RINGBUFFER_WARNING_INTERVAL) - n_warning += 1 - # wait for a while (0.1 us) - time.sleep(1e-7) + # for writers, `self.current_idx` is the next block to write + # if this block is not ready to write, + # we need to wait until it is read by all readers + + # wait for a while + time.sleep(RINGBUFFER_SLEEP_INTERVAL) + + # if we wait for a long time, we should warn the user + if time.monotonic( + ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa + logger.warning( + "No available block found in %s second. ", + VLLM_RINGBUFFER_WARNING_INTERVAL) + n_warning += 1 + continue # found a block that is either # (1) not written @@ -188,13 +195,14 @@ class ShmRingBufferIO: metadata_buffer[i] = 0 # mark the block as written metadata_buffer[0] = 1 + self.current_idx = (self.current_idx + + 1) % self.buffer.max_chunks break @contextmanager def acquire_read(self): assert self._is_reader, "Only readers can acquire read" - start_index = self.current_idx - start_time = time.time() + start_time = time.monotonic() n_warning = 1 while True: with self.buffer.get_metadata(self.current_idx) as metadata_buffer: @@ -204,19 +212,22 @@ class ShmRingBufferIO: # this block is either # (1) not written # (2) already read by this reader - # try to read the next block - self.current_idx = (self.current_idx + - 1) % self.buffer.max_chunks - if self.current_idx == start_index: - # no block found - if time.time( - ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa - logger.warning( - "No available block found in %s second. 
", - VLLM_RINGBUFFER_WARNING_INTERVAL) - n_warning += 1 - # wait for a while (0.1 us) - time.sleep(1e-7) + + # for readers, `self.current_idx` is the next block to read + # if this block is not ready, + # we need to wait until it is written + + # wait for a while + time.sleep(RINGBUFFER_SLEEP_INTERVAL) + + # if we wait for a long time, we should warn the user + if time.monotonic( + ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa + logger.warning( + "No available block found in %s second. ", + VLLM_RINGBUFFER_WARNING_INTERVAL) + n_warning += 1 + continue # found a block that is not read by this reader # let caller read from the buffer @@ -226,6 +237,8 @@ class ShmRingBufferIO: # caller has read from the buffer # set the read flag metadata_buffer[self.reader_rank + 1] = 1 + self.current_idx = (self.current_idx + + 1) % self.buffer.max_chunks break def enqueue(self, obj): -- GitLab From 6806998bf9c7f24d710d9017c901e9e9a30757d5 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 26 Jun 2024 00:15:22 -0700 Subject: [PATCH 155/376] [Bugfix] Fix embedding to support 2D inputs (#5829) --- vllm/model_executor/layers/vocab_parallel_embedding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 1a26c5c63..4650b2c24 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -306,11 +306,11 @@ class VocabParallelEmbedding(torch.nn.Module): self.shard_indices.added_vocab_end_index) else: masked_input = input_ - # Get the embeddings. + # Get the embeddings. output_parallel = F.embedding(masked_input.long(), self.weight) # Mask the output embedding. if self.tp_size > 1: - output_parallel.masked_fill_(input_mask.unsqueeze(1), 0) + output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0) # Reduce across all the model parallel GPUs. 
output = tensor_model_parallel_all_reduce(output_parallel) return output -- GitLab From 3439c5a8e3a1cdab9bf7c4430455ace06be1f28d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 26 Jun 2024 00:58:23 -0700 Subject: [PATCH 156/376] [Bugfix][TPU] Fix KV cache size calculation (#5860) --- vllm/worker/tpu_worker.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 828bb89d7..cd72c7119 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -118,14 +118,15 @@ class TPUWorker(LoraNotSupportedWorkerBase): xm.wait_device_ops() m = xm.get_memory_info(self.device) - program_size = 1024 * 1024 * 1024 # 1GB - free_bytes = max(m["bytes_limit"] - m["bytes_used"] - program_size, 0) - kv_cache_bytes = int(free_bytes * - self.cache_config.gpu_memory_utilization) - kv_cache_dtype_btyes = get_dtype_size(self.cache_dtype) + total_memory_size = m["bytes_limit"] + usable_memory_size = int(total_memory_size * + self.cache_config.gpu_memory_utilization) + profiled = m["bytes_used"] # Weights + intermediate activations. + kv_cache_bytes = max(usable_memory_size - profiled, 0) + dtype_btyes = get_dtype_size(self.cache_dtype) block_size = self.cache_config.block_size num_tpu_blocks = (kv_cache_bytes // - (kv_cache_dtype_btyes * block_size * num_layers * 2 * + (dtype_btyes * block_size * num_layers * 2 * head_size * num_kv_heads)) num_tpu_blocks = (num_tpu_blocks // 8) * 8 # Round down to 8. 
return num_tpu_blocks, 0 -- GitLab From 6984c02a2735d4d08426d2c426c34b6d73bee89e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 26 Jun 2024 16:02:34 +0800 Subject: [PATCH 157/376] [CI/Build] Refactor image test assets (#5821) --- tests/conftest.py | 111 ++++++++++++++++++----------- tests/models/test_llava.py | 26 +++---- tests/models/test_llava_next.py | 30 ++++---- tests/models/test_phi3v.py | 28 ++++---- tests/multimodal/test_processor.py | 24 +++---- 5 files changed, 127 insertions(+), 92 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 67885b932..9d00c7676 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,12 @@ import contextlib import gc import os -from typing import Any, Dict, List, Optional, Tuple, TypeVar +from collections import UserList +from dataclasses import dataclass +from functools import cached_property +from pathlib import Path +from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict, + TypeVar) import pytest import torch @@ -28,21 +33,8 @@ _TEST_DIR = os.path.dirname(__file__) _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] -# Multi modal related -# You can use `.buildkite/download-images.sh` to download the assets -PIXEL_VALUES_FILES = [ - os.path.join(_TEST_DIR, "images", filename) for filename in - ["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"] -] -IMAGE_FEATURES_FILES = [ - os.path.join(_TEST_DIR, "images", filename) for filename in - ["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"] -] -IMAGE_FILES = [ - os.path.join(_TEST_DIR, "images", filename) - for filename in ["stop_sign.jpg", "cherry_blossom.jpg"] -] -assert len(PIXEL_VALUES_FILES) == len(IMAGE_FEATURES_FILES) == len(IMAGE_FILES) +_IMAGE_DIR = Path(_TEST_DIR) / "images" +"""You can use `.buildkite/download-images.sh` to download the assets.""" def _read_prompts(filename: str) -> List[str]: @@ -51,6 
+43,63 @@ def _read_prompts(filename: str) -> List[str]: return prompts +@dataclass(frozen=True) +class ImageAsset: + name: Literal["stop_sign", "cherry_blossom"] + + @cached_property + def pixel_values(self) -> torch.Tensor: + return torch.load(_IMAGE_DIR / f"{self.name}_pixel_values.pt") + + @cached_property + def image_features(self) -> torch.Tensor: + return torch.load(_IMAGE_DIR / f"{self.name}_image_features.pt") + + @cached_property + def pil_image(self) -> Image.Image: + return Image.open(_IMAGE_DIR / f"{self.name}.jpg") + + def for_hf(self) -> Image.Image: + return self.pil_image + + def for_vllm(self, vision_config: VisionLanguageConfig) -> MultiModalData: + image_input_type = vision_config.image_input_type + ImageInputType = VisionLanguageConfig.ImageInputType + + if image_input_type == ImageInputType.IMAGE_FEATURES: + return ImageFeatureData(self.image_features) + if image_input_type == ImageInputType.PIXEL_VALUES: + return ImagePixelData(self.pil_image) + + raise NotImplementedError + + +class _ImageAssetPrompts(TypedDict): + stop_sign: str + cherry_blossom: str + + +class _ImageAssets(UserList[ImageAsset]): + + def __init__(self) -> None: + super().__init__( + [ImageAsset("stop_sign"), + ImageAsset("cherry_blossom")]) + + def prompts(self, prompts: _ImageAssetPrompts) -> List[str]: + """ + Convenience method to define the prompt for each test image. + + The order of the returned prompts matches the order of the + assets when iterating through this object. 
+ """ + return [prompts["stop_sign"], prompts["cherry_blossom"]] + + +IMAGE_ASSETS = _ImageAssets() +"""Singleton instance of :class:`_ImageAssets`.""" + + def cleanup(): destroy_model_parallel() destroy_distributed_environment() @@ -81,31 +130,6 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): cleanup() -@pytest.fixture(scope="session") -def hf_images() -> List[Image.Image]: - return [Image.open(filename) for filename in IMAGE_FILES] - - -@pytest.fixture() -def vllm_images(request) -> List[MultiModalData]: - vision_language_config = request.getfixturevalue("model_and_config")[1] - if vision_language_config.image_input_type == ( - VisionLanguageConfig.ImageInputType.IMAGE_FEATURES): - return [ - ImageFeatureData(torch.load(filename)) - for filename in IMAGE_FEATURES_FILES - ] - else: - return [ - ImagePixelData(Image.open(filename)) for filename in IMAGE_FILES - ] - - -@pytest.fixture() -def vllm_image_tensors(request) -> List[torch.Tensor]: - return [torch.load(filename) for filename in PIXEL_VALUES_FILES] - - @pytest.fixture def example_prompts() -> List[str]: prompts = [] @@ -122,6 +146,11 @@ def example_long_prompts() -> List[str]: return prompts +@pytest.fixture(scope="session") +def image_assets() -> _ImageAssets: + return IMAGE_ASSETS + + _STR_DTYPE_TO_TORCH_DTYPE = { "half": torch.half, "bfloat16": torch.bfloat16, diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index b41c69f72..ac1d2ece6 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -5,17 +5,17 @@ from transformers import AutoTokenizer from vllm.config import VisionLanguageConfig -from ..conftest import IMAGE_FILES +from ..conftest import IMAGE_ASSETS pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass -HF_IMAGE_PROMPTS = [ +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": "\nUSER: What's the content of the image?\nASSISTANT:", + "cherry_blossom": "\nUSER: What is the 
season?\nASSISTANT:", -] - -assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) +}) def iter_llava_configs(model_name: str): @@ -49,28 +49,28 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... It also reduces `output_str` from "bla" to "bla". """ - input_ids, output_str = vllm_output + output_ids, output_str = vllm_output image_token_id = vlm_config.image_token_id tokenizer = AutoTokenizer.from_pretrained(model_id) image_token_str = tokenizer.decode(image_token_id) - hf_input_ids = [ - input_id for idx, input_id in enumerate(input_ids) - if input_id != image_token_id or input_ids[idx - 1] != image_token_id + hf_output_ids = [ + token_id for idx, token_id in enumerate(output_ids) + if token_id != image_token_id or output_ids[idx - 1] != image_token_id ] hf_output_str = output_str \ .replace(image_token_str * vlm_config.image_feature_size, "") - return hf_input_ids, hf_output_str + return hf_output_ids, hf_output_str # TODO: Add test for `tensor_parallel_size` [ref: PR #3883] @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) -def test_models(hf_runner, vllm_runner, hf_images, vllm_images, - model_and_config, dtype: str, max_tokens: int) -> None: +def test_models(hf_runner, vllm_runner, image_assets, model_and_config, + dtype: str, max_tokens: int) -> None: """Inference result should be the same between hf and vllm. All the image fixtures for the test is under tests/images. @@ -81,6 +81,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images, The text output is sanitized to be able to compare with hf. 
""" model_id, vlm_config = model_and_config + hf_images = [asset.for_hf() for asset in image_assets] + vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py index 0eca5cb53..d36e50387 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/test_llava_next.py @@ -5,7 +5,7 @@ from transformers import AutoTokenizer from vllm.config import VisionLanguageConfig -from ..conftest import IMAGE_FILES +from ..conftest import IMAGE_ASSETS pytestmark = pytest.mark.vlm @@ -15,12 +15,12 @@ _PREFACE = ( "questions.") # The image token is placed before "user" on purpose so that the test can pass -HF_IMAGE_PROMPTS = [ - f"{_PREFACE} \nUSER: What's the content of the image? ASSISTANT:", - f"{_PREFACE} \nUSER: What is the season? ASSISTANT:", -] - -assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + f"{_PREFACE} \nUSER: What's the content of the image?\nASSISTANT:", + "cherry_blossom": + f"{_PREFACE} \nUSER: What is the season?\nASSISTANT:", +}) def iter_llava_next_configs(model_name: str): @@ -56,20 +56,20 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... It also reduces `output_str` from "bla" to "bla". 
""" - input_ids, output_str = vllm_output + output_ids, output_str = vllm_output image_token_id = vlm_config.image_token_id tokenizer = AutoTokenizer.from_pretrained(model_id) image_token_str = tokenizer.decode(image_token_id) - hf_input_ids = [ - input_id for idx, input_id in enumerate(input_ids) - if input_id != image_token_id or input_ids[idx - 1] != image_token_id + hf_output_ids = [ + token_id for idx, token_id in enumerate(output_ids) + if token_id != image_token_id or output_ids[idx - 1] != image_token_id ] hf_output_str = output_str \ .replace(image_token_str * vlm_config.image_feature_size, " ") - return hf_input_ids, hf_output_str + return hf_output_ids, hf_output_str @pytest.mark.xfail( @@ -78,8 +78,8 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) -def test_models(hf_runner, vllm_runner, hf_images, vllm_images, - model_and_config, dtype: str, max_tokens: int) -> None: +def test_models(hf_runner, vllm_runner, image_assets, model_and_config, + dtype: str, max_tokens: int) -> None: """Inference result should be the same between hf and vllm. All the image fixtures for the test is under tests/images. @@ -90,6 +90,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images, The text output is sanitized to be able to compare with hf. 
""" model_id, vlm_config = model_and_config + hf_images = [asset.for_hf() for asset in image_assets] + vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index a29d50df4..03c130466 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -6,17 +6,17 @@ from transformers import AutoTokenizer from vllm.config import VisionLanguageConfig from vllm.utils import is_cpu -from ..conftest import IMAGE_FILES +from ..conftest import IMAGE_ASSETS pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass -HF_IMAGE_PROMPTS = [ +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501 - "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n", -] - -assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) + "cherry_blossom": + "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n", # noqa: E501 +}) def iter_phi3v_configs(model_name: str): @@ -50,22 +50,22 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... It also reduces `output_str` from "bla" to "bla". 
""" - input_ids, output_str = vllm_output + output_ids, output_str = vllm_output image_token_id = vlm_config.image_token_id tokenizer = AutoTokenizer.from_pretrained(model_id) image_token_str = tokenizer.decode(image_token_id) - hf_input_ids = [ - input_id if input_id != image_token_id else 0 - for idx, input_id in enumerate(input_ids) + hf_output_ids = [ + token_id if token_id != image_token_id else 0 + for idx, token_id in enumerate(output_ids) ] hf_output_str = output_str \ .replace(image_token_str * vlm_config.image_feature_size, "") \ .replace("", " ").replace("<|user|>", "") \ .replace("<|end|>\n<|assistant|>", " ") - return hf_input_ids, hf_output_str + return hf_output_ids, hf_output_str target_dtype = "half" @@ -82,8 +82,8 @@ if is_cpu(): @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [128]) -def test_models(hf_runner, vllm_runner, hf_images, vllm_images, - model_and_config, dtype: str, max_tokens: int) -> None: +def test_models(hf_runner, vllm_runner, image_assets, model_and_config, + dtype: str, max_tokens: int) -> None: """Inference result should be the same between hf and vllm. All the image fixtures for the test is under tests/images. @@ -94,6 +94,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images, The text output is sanitized to be able to compare with hf. 
""" model_id, vlm_config = model_and_config + hf_images = [asset.for_hf() for asset in image_assets] + vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] # use eager mode for hf runner, since phi3_v didn't work with flash_attn hf_model_kwargs = {"_attn_implementation": "eager"} diff --git a/tests/multimodal/test_processor.py b/tests/multimodal/test_processor.py index 51c352361..9ac48dfab 100644 --- a/tests/multimodal/test_processor.py +++ b/tests/multimodal/test_processor.py @@ -10,7 +10,7 @@ from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE @pytest.mark.parametrize("dtype", ["half", "float"]) -def test_clip_image_processor(hf_images, dtype): +def test_clip_image_processor(image_assets, dtype): MODEL_NAME = "llava-hf/llava-1.5-7b-hf" IMAGE_HEIGHT = IMAGE_WIDTH = 560 @@ -35,13 +35,13 @@ def test_clip_image_processor(hf_images, dtype): image_processor_revision=None, ) - for image in hf_images: + for asset in image_assets: hf_result = hf_processor.preprocess( - image, + asset.pil_image, return_tensors="pt", ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype]) vllm_result = MULTIMODAL_REGISTRY.process_input( - ImagePixelData(image), + ImagePixelData(asset.pil_image), model_config=model_config, vlm_config=vlm_config, ) @@ -59,7 +59,7 @@ def test_clip_image_processor(hf_images, dtype): reason="Inconsistent image processor being used due to lack " "of support for dynamic image token replacement") @pytest.mark.parametrize("dtype", ["half", "float"]) -def test_llava_next_image_processor(hf_images, dtype): +def test_llava_next_image_processor(image_assets, dtype): MODEL_NAME = "llava-hf/llava-v1.6-34b-hf" IMAGE_HEIGHT = IMAGE_WIDTH = 560 @@ -84,13 +84,13 @@ def test_llava_next_image_processor(hf_images, dtype): image_processor_revision=None, ) - for image in hf_images: + for asset in image_assets: hf_result = hf_processor.preprocess( - image, + asset.pil_image, return_tensors="pt", ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype]) vllm_result = 
MULTIMODAL_REGISTRY.process_input( - ImagePixelData(image), + ImagePixelData(asset.pil_image), model_config=model_config, vlm_config=vlm_config, ) @@ -107,7 +107,7 @@ def test_llava_next_image_processor(hf_images, dtype): @pytest.mark.xfail( reason="Example image pixels were not processed using HuggingFace") @pytest.mark.parametrize("dtype", ["float"]) -def test_image_pixel_types(hf_images, vllm_image_tensors, dtype): +def test_image_pixel_types(image_assets, dtype): MODEL_NAME = "llava-hf/llava-1.5-7b-hf" IMAGE_HEIGHT = IMAGE_WIDTH = 560 @@ -129,14 +129,14 @@ def test_image_pixel_types(hf_images, vllm_image_tensors, dtype): image_processor_revision=None, ) - for image, tensor in zip(hf_images, vllm_image_tensors): + for asset in image_assets: image_result = MULTIMODAL_REGISTRY.process_input( - ImagePixelData(image), + ImagePixelData(asset.pil_image), model_config=model_config, vlm_config=vlm_config, ) tensor_result = MULTIMODAL_REGISTRY.process_input( - ImagePixelData(tensor), + ImagePixelData(asset.pixel_values), model_config=model_config, vlm_config=vlm_config, ) -- GitLab From 5bfd1bbc9831fed39632f071f16bb62373ec1249 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Wed, 26 Jun 2024 11:16:00 -0400 Subject: [PATCH 158/376] [Kernel] Adding bias epilogue support for `cutlass_scaled_mm` (#5560) Co-authored-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> Co-authored-by: Lucas Wilkinson --- CMakeLists.txt | 3 +- csrc/ops.h | 3 +- .../cutlass_w8a8/scaled_mm_c2x.cu | 228 +++++++++++++----- .../cutlass_w8a8/scaled_mm_c3x.cu | 139 ++++++++--- .../cutlass_w8a8/scaled_mm_entry.cu | 32 ++- csrc/torch_bindings.cpp | 2 +- tests/kernels/test_cutlass.py | 100 +++++--- vllm/_custom_ops.py | 10 +- 8 files changed, 383 insertions(+), 134 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 801429096..ede9192cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,8 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions 
LANGUAGES CXX) -option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda") +# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) +set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") diff --git a/csrc/ops.h b/csrc/ops.h index 6f0a7143c..ae04150ea 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -96,7 +96,8 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, - torch::Tensor const& b_scales); + torch::Tensor const& b_scales, + c10::optional const& bias); #endif diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index 38a20a172..6ce25c5ac 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -77,24 +77,12 @@ struct enable_sm89_to_sm90 : Kernel { }; /* - This epilogue function defines a quantized GEMM operation similar to - torch._scaled_mm. - - A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or - per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. - A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. - - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. These scales may be either per-tensor or - per row or column. -*/ + * This class provides the common ScaleA and ScaleB descriptors for the + * ScaledEpilogue and ScaledEpilogueBias classes. 
+ */ template -struct ScaledEpilogue { - private: +struct ScaledEpilogueBase { + protected: using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< @@ -102,6 +90,32 @@ struct ScaledEpilogue { using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< OutputTileThreadMap, float, Stride, Int<1>, Int<0>>>; +}; + +/* + This epilogue function defines a quantized GEMM operation similar to + torch._scaled_mm. + + A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or + per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). + + So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. 
+*/ +template +struct ScaledEpilogue + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::ScaleA; + using ScaleB = typename SUPER::ScaleB; using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< cutlass::multiplies, float, float, @@ -134,6 +148,53 @@ struct ScaledEpilogue { } }; +template +struct ScaledEpilogueBias + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::ScaleA; + using ScaleB = typename SUPER::ScaleB; + + using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::threadblock::Sm80EVT; + + using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast< + OutputTileThreadMap, ElementD, Stride, Int<1>, Int<0>>>; + + public: + using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + using ScaleAArgs = typename ScaleA::Arguments; + using ScaleBArgs = typename ScaleB::Arguments; + using BiasArgs = typename Bias::Arguments; + + ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; + ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; + BiasArgs bias_args{static_cast(bias.data_ptr()), {}}; + + typename EVTCompute0::Arguments evt0_compute_args{b_args}; + + typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args, + bias_args}; + return evt_compute_args; + } +}; + template typename ArchGuard, typename ElementAB_, typename ElementD_, 
template typename Epilogue_, typename TileShape, @@ -168,13 +229,13 @@ struct cutlass_2x_gemm { // clang-format off using RowMajor = typename cutlass::layout::RowMajor; using ColumnMajor = typename cutlass::layout::ColumnMajor; - using KernelType = + using KernelType = ArchGuard typename Epilogue, + typename... EpilogueArgs> +void cutlass_scaled_mm_sm75_epilogue(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... epilogue_args) { TORCH_CHECK(a.dtype() == torch::kInt8); TORCH_CHECK(b.dtype() == torch::kInt8); - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; @@ -420,78 +480,130 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, if (out.dtype() == torch::kBFloat16) { return cutlass_gemm_caller>( - out, a, b, a_scales, b_scales); + Epilogue, TileShape, WarpShape, InstructionShape, 2>>( + out, a, b, std::forward(epilogue_args)...); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); return cutlass_gemm_caller>( - out, a, b, a_scales, b_scales); + Epilogue, TileShape, WarpShape, InstructionShape, 2>>( + out, a, b, std::forward(epilogue_args)...); } } -void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a, +void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { - TORCH_CHECK(a.dtype() == torch::kInt8); - TORCH_CHECK(b.dtype() == torch::kInt8); + torch::Tensor const& b_scales, + c10::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + if (bias) { + TORCH_CHECK(bias->dtype() == out.dtype(), + "currently bias dtype must match output dtype ", out.dtype()); + return cutlass_scaled_mm_sm75_epilogue( + out, a, b, a_scales, b_scales, 
*bias); + } else { + return cutlass_scaled_mm_sm75_epilogue(out, a, b, a_scales, + b_scales); + } +} + +template